diff --git a/.circleci/.gitignore b/.circleci/.gitignore deleted file mode 100644 index 485dee64bcf..00000000000 --- a/.circleci/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.idea diff --git a/.circleci/build_docs/commit_docs.sh b/.circleci/build_docs/commit_docs.sh deleted file mode 100755 index 04e3538fefc..00000000000 --- a/.circleci/build_docs/commit_docs.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -set -ex - - -if [ "$2" == "" ]; then - echo call as "$0" "" "" - echo where src is the root of the built documentation git checkout and - echo branch should be "main" or "1.7" or so - exit 1 -fi - -src=$1 -target=$2 - -echo "committing docs from ${src} to ${target}" - -pushd "${src}" -git checkout gh-pages -mkdir -p ./"${target}" -rm -rf ./"${target}"/* -cp -r "${src}/docs/build/html/"* ./"$target" -if [ "${target}" == "main" ]; then - mkdir -p ./_static - rm -rf ./_static/* - cp -r "${src}/docs/build/html/_static/"* ./_static - git add --all ./_static || true -fi -git add --all ./"${target}" || true -git config user.email "soumith+bot@pytorch.org" -git config user.name "pytorchbot" -# If there aren't changes, don't make a commit; push is no-op -git commit -m "auto-generating sphinx docs" || true -git remote add https https://github.com/pytorch/vision.git -git push -u https gh-pages diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 4262656ae91..00000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,3300 +0,0 @@ -version: 2.1 - -# How to test the Linux jobs: -# - Install CircleCI local CLI: https://circleci.com/docs/2.0/local-cli/ -# - circleci config process .circleci/config.yml > gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.7 -# - Replace binary_linux_wheel_py3.7 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - checkout_merge: - description: "checkout merge branch" - steps: - - checkout -# - run: -# name: Checkout merge branch -# command: | -# set -ex -# BRANCH=$(git rev-parse --abbrev-ref HEAD) -# if [[ "$BRANCH" != "main" ]]; then -# git fetch --force origin ${CIRCLE_BRANCH}/merge:merged/${CIRCLE_BRANCH} -# git checkout "merged/$CIRCLE_BRANCH" -# fi - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=nightly - # On tags upload to test instead - if [[ -n "${CIRCLE_TAG}" ]]; then - our_upload_channel=test - fi - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - - brew_update: - description: "Update Homebrew and install base formulae" - steps: - - run: - name: Update Homebrew - no_output_timeout: "10m" - command: | - set -ex - - # Update repositories manually. - # Running `brew update` produces a comparison between the - # current checkout and the updated checkout, which takes a - # very long time because the existing checkout is 2y old. - for path in $(find /usr/local/Homebrew -type d -name .git) - do - cd $path/.. - git fetch --depth=1 origin - git reset --hard origin/master - done - - export HOMEBREW_NO_AUTO_UPDATE=1 - - # Install expect and moreutils so that we can call `unbuffer` and `ts`. 
- # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards. - brew install coreutils - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect - - brew_install: - description: "Install Homebrew formulae" - parameters: - formulae: - type: string - default: "" - steps: - - run: - name: Install << parameters.formulae >> - no_output_timeout: "10m" - command: | - set -ex - export HOMEBREW_NO_AUTO_UPDATE=1 - brew install << parameters.formulae >> - - run_brew_for_ios_build: - steps: - - brew_update - - brew_install: - formulae: libtool - - apt_install: - parameters: - args: - type: string - descr: - type: string - default: "" - update: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> apt install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: | - <<# parameters.update >> sudo apt update -qy <> - sudo apt install << parameters.args >> - - pip_install: - parameters: - args: - type: string - descr: - type: string - default: "" - user: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> pip install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: > - pip install - <<# parameters.user >> --user <> - --progress-bar=off - << parameters.args >> - - install_torchvision: - parameters: - editable: - type: boolean - default: true - steps: - - pip_install: - args: --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install PyTorch from nightly releases - - pip_install: - args: --no-build-isolation <<# parameters.editable >> --editable <> . - descr: Install torchvision <<# parameters.editable >> in editable mode <> - - install_prototype_dependencies: - steps: - - pip_install: - args: iopath - descr: Install third-party dependencies - - pip_install: - args: --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install torchdata from nightly releases - - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. - # This command can be used if only a selection of tests need to be run, for ad-hoc files. 
- run_tests_selective: - parameters: - file_or_dir: - type: string - steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock - - run: - name: Run tests - command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> - - store_test_results: - path: test-results - - download_model_weights: - parameters: - extract_roots: - type: string - default: "torchvision/models" - background: - type: boolean - default: true - steps: - - apt_install: - args: parallel wget - descr: Install download utilitites - - run: - name: Download model weights - background: << parameters.background >> - command: | - mkdir -p ~/.cache/torch/hub/checkpoints - python scripts/collect_model_urls.py << parameters.extract_roots >> \ - | parallel -j0 'wget --no-verbose -O ~/.cache/torch/hub/checkpoints/`basename {}` {}\?source=ci' - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.7)" - type: string - cu_version: - description: "CUDA version to build against, in CU format (e.g., cpu or cu100)" - type: string - default: "cpu" - unicode_abi: - description: "Python 2.7 wheel only: whether or not we are cp27mu (default: no)" - type: string - default: "" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "pytorch/manylinux-cuda102" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cpu" - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -torchvision_ios_params: &torchvision_ios_params - parameters: - build_environment: - type: string - default: "" - ios_arch: - type: string - default: "" - ios_platform: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - IOS_ARCH: << parameters.ios_arch >> - IOS_PLATFORM: << parameters.ios_platform >> - -torchvision_android_params: &torchvision_android_params - parameters: - build_environment: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: torchvision/smoke_test:latest - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: jinja2 pyyaml - - run: - name: Check CircleCI config consistency - command: | - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: pre-commit - descr: Install lint utilities - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - lint_c: - docker: - - image: cimg/python:3.7 - steps: - - apt_install: - args: libtinfo5 - descr: Install additional system libraries - - checkout - - run: - name: Install lint utilities - command: | - curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o clang-format - chmod +x clang-format - sudo mv clang-format /opt/clang-format - - run: - name: Lint C code - command: ./.circleci/unittest/linux/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable /opt/clang-format - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - type_check_python: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision: - editable: true - - install_prototype_dependencies - - pip_install: - args: mypy - descr: Install Python type check utilities - - run: - name: Check Python types statically - command: mypy --install-types --non-interactive --config-file mypy.ini - - unittest_torchhub: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - run_tests_selective: - file_or_dir: test/test_hub.py - - unittest_onnx: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - pip_install: - args: onnx onnxruntime - descr: Install ONNX - - run_tests_selective: - file_or_dir: test/test_onnx.py - - unittest_extended: - docker: - - image: cimg/python:3.7 - resource_class: xlarge - steps: - - checkout - - download_model_weights - - install_torchvision - - run: - name: Enable extended tests - command: echo 'export PYTORCH_TEST_WITH_EXTENDED=1' >> $BASH_ENV - - run_tests_selective: - file_or_dir: test/test_extended_*.py - - binary_linux_wheel: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_linux_conda: - <<: *binary_common - docker: - - image: "<< parameters.conda_docker_image >>" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: packaging/build_conda.sh - - store_artifacts: - path: /opt/conda/conda-bld/linux-64 - - persist_to_workspace: - root: /opt/conda/conda-bld/linux-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_conda: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 20m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate base - conda install -yq conda-build "conda-package-handling!=1.5.0" - # cudatoolkit >= 11 isn't available for windows in the nvidia channel - if [[ "${CU_VERSION}" =~ cu11.* ]]; then - export CONDA_CHANNEL_FLAGS="-c conda-forge" - fi - packaging/build_conda.sh - rm 
/C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 - - store_artifacts: - path: C:/tools/miniconda3/conda-bld/win-64 - - persist_to_workspace: - root: C:/tools/miniconda3/conda-bld/win-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_wheel: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build wheel packages - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_macos_wheel: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - # Cannot easily deduplicate this as source'ing activate - # will set environment variables which we need to propagate - # to build_wheel.sh - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_ios_build: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_build.sh" - cat "$script" - source "$script" - - persist_to_workspace: - root: /Users/distiller/workspace/ - paths: ios - - binary_ios_upload: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_upload.sh" - cat "$script" - source "$script" - - binary_android_build: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_build.sh" - cat "$script" - source "$script" - - store_artifacts: - path: ~/workspace/artifacts - - binary_android_upload: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_upload.sh" - cat "$script" - source "$script" - - binary_macos_conda: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build - packaging/build_conda.sh - - store_artifacts: - path: /Users/distiller/miniconda3/conda-bld/osx-64 - - persist_to_workspace: - root: /Users/distiller/miniconda3/conda-bld/osx-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - # Requires org-member context - binary_conda_upload: - docker: - - image: continuumio/miniconda - steps: - - 
attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - command: | - # Prevent credential from leaking - conda install -yq anaconda-client - set -x - anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload ~/workspace/*.tar.bz2 -u "pytorch-${UPLOAD_CHANNEL}" --label main --no-progress --force - - # Requires org-member context - binary_wheel_upload: - parameters: - subfolder: - description: "What whl subfolder to upload to, e.g., blank or cu100/ (trailing slash is important)" - type: string - docker: - - image: cimg/python:3.7 - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - checkout - - pip_install: - args: awscli - - run: - command: | - export PATH="$HOME/.local/bin:$PATH" - # Prevent credential from leaking - set +x - export AWS_ACCESS_KEY_ID="${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}" - set -x - for pkg in ~/workspace/*.whl; do - aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read - done - - smoke_test_linux_conda: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_linux_pip: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/nightly/torch_nightly.html - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_docker_image_build: - machine: - image: ubuntu-2004:202104-01 - resource_class: large - environment: - image_name: torchvision/smoke_test - steps: - - checkout - - designate_upload_channel - - run: - name: Build and push Docker image - no_output_timeout: "1h" - command: | - set +x - echo "${DOCKER_HUB_TOKEN}" | docker login --username "${DOCKER_HUB_USERNAME}" --password-stdin - set -x - cd .circleci/smoke_test/docker && docker build . 
-t ${image_name}:${CIRCLE_WORKFLOW_ID} - docker tag ${image_name}:${CIRCLE_WORKFLOW_ID} ${image_name}:latest - docker push ${image_name}:${CIRCLE_WORKFLOW_ID} - docker push ${image_name}:latest - - smoke_test_win_conda: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda env remove -n python${PYTHON_VERSION} || true - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_win_pip: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/nightly/torch_nightly.html - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda102" - resource_class: 2xlarge+ - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.medium - environment: - image_name: "pytorch/manylinux-cuda102" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - # Here we create an envlist file that contains some env variables that we want the docker container to be aware of. - # Normally, the CIRCLECI variable is set and available on all CI workflows: https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables. - # They're avaiable in all the other workflows (OSX and Windows). - # But here, we're running the unittest_linux_gpu workflows in a docker container, where those variables aren't accessible. - # So instead we dump the variables we need in env.list and we pass that file when invoking "docker run". - name: export CIRCLECI env var - command: echo "CIRCLECI=true" >> ./env.list - - run: - name: Install torchvision - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD -e UPLOAD_CHANNEL -e CU_VERSION "${image_name}" .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: docker run --env-file ./env.list -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post Process - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - - key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - CUDA_VERSION: "11.3" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - - key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - designate_upload_channel - - run: - name: Install wget - command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget - # Disable brew auto update which is very slow - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - cmake_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda102" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: packaging/build_cmake.sh - - cmake_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.small - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Build torchvision C++ distribution and test - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -e UPLOAD_CHANNEL -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> packaging/build_cmake.sh - - cmake_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - 
checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build cmake - packaging/build_cmake.sh - - cmake_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/build_cmake.sh - - cmake_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_cmake.sh - - build_docs: - <<: *binary_common - docker: - - image: cimg/python:3.7 - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - checkout - - download_model_weights - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - designate_upload_channel - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Build docs - command: | - set -ex - # turn v1.12.0rc3 into 1.12.0 - tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/') - VERSION=${tag:-main} - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env - pushd docs - pip install --progress-bar=off -r requirements.txt - make html - popd - - persist_to_workspace: - root: ./ - paths: - - "*" - - store_artifacts: - path: ./docs/build/html - destination: docs - - upload_docs: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda100" - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - run: - name: Generate netrc - command: | - # set credentials for https pushing - # requires the org-member context - cat > ~/.netrc \< gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.7 -# - Replace binary_linux_wheel_py3.7 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - checkout_merge: - description: "checkout merge branch" - steps: - - checkout -# - run: -# name: Checkout merge branch -# command: | -# set -ex -# BRANCH=$(git rev-parse --abbrev-ref HEAD) -# if [[ "$BRANCH" != "main" ]]; then -# git fetch --force origin ${CIRCLE_BRANCH}/merge:merged/${CIRCLE_BRANCH} -# git checkout "merged/$CIRCLE_BRANCH" -# fi - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=nightly - # On tags upload to test instead - if [[ -n "${CIRCLE_TAG}" ]]; then - our_upload_channel=test - fi - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - - brew_update: - description: "Update Homebrew and install base formulae" - steps: - - run: - name: Update Homebrew - no_output_timeout: "10m" - command: | - set -ex - - # Update repositories manually. - # Running `brew update` produces a comparison between the - # current checkout and the updated checkout, which takes a - # very long time because the existing checkout is 2y old. 
- for path in $(find /usr/local/Homebrew -type d -name .git) - do - cd $path/.. - git fetch --depth=1 origin - git reset --hard origin/master - done - - export HOMEBREW_NO_AUTO_UPDATE=1 - - # Install expect and moreutils so that we can call `unbuffer` and `ts`. - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards. - brew install coreutils - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect - - brew_install: - description: "Install Homebrew formulae" - parameters: - formulae: - type: string - default: "" - steps: - - run: - name: Install << parameters.formulae >> - no_output_timeout: "10m" - command: | - set -ex - export HOMEBREW_NO_AUTO_UPDATE=1 - brew install << parameters.formulae >> - - run_brew_for_ios_build: - steps: - - brew_update - - brew_install: - formulae: libtool - - apt_install: - parameters: - args: - type: string - descr: - type: string - default: "" - update: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> apt install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: | - <<# parameters.update >> sudo apt update -qy <> - sudo apt install << parameters.args >> - - pip_install: - parameters: - args: - type: string - descr: - type: string - default: "" - user: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> pip install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: > - pip install - <<# parameters.user >> --user <> - --progress-bar=off - << parameters.args >> - - install_torchvision: - parameters: - editable: - type: boolean - default: true - steps: - - pip_install: - args: --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install PyTorch from nightly releases - - pip_install: - args: --no-build-isolation <<# parameters.editable >> --editable <> . - descr: Install torchvision <<# parameters.editable >> in editable mode <> - - install_prototype_dependencies: - steps: - - pip_install: - args: iopath - descr: Install third-party dependencies - - pip_install: - args: --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install torchdata from nightly releases - - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. - # This command can be used if only a selection of tests need to be run, for ad-hoc files. 
- run_tests_selective: - parameters: - file_or_dir: - type: string - steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock - - run: - name: Run tests - command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> - - store_test_results: - path: test-results - - download_model_weights: - parameters: - extract_roots: - type: string - default: "torchvision/models" - background: - type: boolean - default: true - steps: - - apt_install: - args: parallel wget - descr: Install download utilitites - - run: - name: Download model weights - background: << parameters.background >> - command: | - mkdir -p ~/.cache/torch/hub/checkpoints - python scripts/collect_model_urls.py << parameters.extract_roots >> \ - | parallel -j0 'wget --no-verbose -O ~/.cache/torch/hub/checkpoints/`basename {}` {}\?source=ci' - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.7)" - type: string - cu_version: - description: "CUDA version to build against, in CU format (e.g., cpu or cu100)" - type: string - default: "cpu" - unicode_abi: - description: "Python 2.7 wheel only: whether or not we are cp27mu (default: no)" - type: string - default: "" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "pytorch/manylinux-cuda102" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cpu" - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -torchvision_ios_params: &torchvision_ios_params - parameters: - build_environment: - type: string - default: "" - ios_arch: - type: string - default: "" - ios_platform: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - IOS_ARCH: << parameters.ios_arch >> - IOS_PLATFORM: << parameters.ios_platform >> - -torchvision_android_params: &torchvision_android_params - parameters: - build_environment: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: torchvision/smoke_test:latest - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: jinja2 pyyaml - - run: - name: Check CircleCI config consistency - command: | - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: pre-commit - descr: Install lint utilities - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - lint_c: - docker: - - image: cimg/python:3.7 - steps: - - apt_install: - args: libtinfo5 - descr: Install additional system libraries - - checkout - - run: - name: Install lint utilities - command: | - curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o clang-format - chmod +x clang-format - sudo mv clang-format /opt/clang-format - - run: - name: Lint C code - command: ./.circleci/unittest/linux/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable /opt/clang-format - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - type_check_python: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision: - editable: true - - install_prototype_dependencies - - pip_install: - args: mypy - descr: Install Python type check utilities - - run: - name: Check Python types statically - command: mypy --install-types --non-interactive --config-file mypy.ini - - unittest_torchhub: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - run_tests_selective: - file_or_dir: test/test_hub.py - - unittest_onnx: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - pip_install: - args: onnx onnxruntime - descr: Install ONNX - - run_tests_selective: - file_or_dir: test/test_onnx.py - - unittest_extended: - docker: - - image: cimg/python:3.7 - resource_class: xlarge - steps: - - checkout - - download_model_weights - - install_torchvision - - run: - name: Enable extended tests - command: echo 'export PYTORCH_TEST_WITH_EXTENDED=1' >> $BASH_ENV - - run_tests_selective: - file_or_dir: test/test_extended_*.py - - binary_linux_wheel: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_linux_conda: - <<: *binary_common - docker: - - image: "<< parameters.conda_docker_image >>" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: packaging/build_conda.sh - - store_artifacts: - path: /opt/conda/conda-bld/linux-64 - - persist_to_workspace: - root: /opt/conda/conda-bld/linux-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_conda: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 20m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate base - conda install -yq conda-build "conda-package-handling!=1.5.0" - # cudatoolkit >= 11 isn't available for windows in the nvidia channel - if [[ "${CU_VERSION}" =~ cu11.* ]]; then - export CONDA_CHANNEL_FLAGS="-c conda-forge" - fi - packaging/build_conda.sh - rm 
/C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 - - store_artifacts: - path: C:/tools/miniconda3/conda-bld/win-64 - - persist_to_workspace: - root: C:/tools/miniconda3/conda-bld/win-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_wheel: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build wheel packages - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_macos_wheel: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - # Cannot easily deduplicate this as source'ing activate - # will set environment variables which we need to propagate - # to build_wheel.sh - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_ios_build: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_build.sh" - cat "$script" - source "$script" - - persist_to_workspace: - root: /Users/distiller/workspace/ - paths: ios - - binary_ios_upload: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_upload.sh" - cat "$script" - source "$script" - - binary_android_build: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_build.sh" - cat "$script" - source "$script" - - store_artifacts: - path: ~/workspace/artifacts - - binary_android_upload: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_upload.sh" - cat "$script" - source "$script" - - binary_macos_conda: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build - packaging/build_conda.sh - - store_artifacts: - path: /Users/distiller/miniconda3/conda-bld/osx-64 - - persist_to_workspace: - root: /Users/distiller/miniconda3/conda-bld/osx-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - # Requires org-member context - binary_conda_upload: - docker: - - image: continuumio/miniconda - steps: - - 
attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - command: | - # Prevent credential from leaking - conda install -yq anaconda-client - set -x - anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload ~/workspace/*.tar.bz2 -u "pytorch-${UPLOAD_CHANNEL}" --label main --no-progress --force - - # Requires org-member context - binary_wheel_upload: - parameters: - subfolder: - description: "What whl subfolder to upload to, e.g., blank or cu100/ (trailing slash is important)" - type: string - docker: - - image: cimg/python:3.7 - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - checkout - - pip_install: - args: awscli - - run: - command: | - export PATH="$HOME/.local/bin:$PATH" - # Prevent credential from leaking - set +x - export AWS_ACCESS_KEY_ID="${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}" - set -x - for pkg in ~/workspace/*.whl; do - aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read - done - - smoke_test_linux_conda: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_linux_pip: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/nightly/torch_nightly.html - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_docker_image_build: - machine: - image: ubuntu-2004:202104-01 - resource_class: large - environment: - image_name: torchvision/smoke_test - steps: - - checkout - - designate_upload_channel - - run: - name: Build and push Docker image - no_output_timeout: "1h" - command: | - set +x - echo "${DOCKER_HUB_TOKEN}" | docker login --username "${DOCKER_HUB_USERNAME}" --password-stdin - set -x - cd .circleci/smoke_test/docker && docker build . 
-t ${image_name}:${CIRCLE_WORKFLOW_ID} - docker tag ${image_name}:${CIRCLE_WORKFLOW_ID} ${image_name}:latest - docker push ${image_name}:${CIRCLE_WORKFLOW_ID} - docker push ${image_name}:latest - - smoke_test_win_conda: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda env remove -n python${PYTHON_VERSION} || true - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_win_pip: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/nightly/torch_nightly.html - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda102" - resource_class: 2xlarge+ - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.medium - environment: - image_name: "pytorch/manylinux-cuda102" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - # Here we create an envlist file that contains some env variables that we want the docker container to be aware of. - # Normally, the CIRCLECI variable is set and available on all CI workflows: https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables. - # They're avaiable in all the other workflows (OSX and Windows). - # But here, we're running the unittest_linux_gpu workflows in a docker container, where those variables aren't accessible. - # So instead we dump the variables we need in env.list and we pass that file when invoking "docker run". - name: export CIRCLECI env var - command: echo "CIRCLECI=true" >> ./env.list - - run: - name: Install torchvision - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD -e UPLOAD_CHANNEL -e CU_VERSION "${image_name}" .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: docker run --env-file ./env.list -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post Process - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - CUDA_VERSION: "11.3" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - designate_upload_channel - - run: - name: Install wget - command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget - # Disable brew auto update which is very slow - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - cmake_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda102" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: packaging/build_cmake.sh - - cmake_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.small - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Build torchvision C++ distribution and test - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -e UPLOAD_CHANNEL -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> 
packaging/build_cmake.sh - - cmake_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build cmake - packaging/build_cmake.sh - - cmake_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/build_cmake.sh - - cmake_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_cmake.sh - - build_docs: - <<: *binary_common - docker: - - image: cimg/python:3.7 - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - checkout - - download_model_weights - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - designate_upload_channel - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Build docs - command: | - set -ex - # turn v1.12.0rc3 into 1.12.0 - tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/') - VERSION=${tag:-main} - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env - pushd docs - pip install --progress-bar=off -r requirements.txt - make html - popd - - persist_to_workspace: - root: ./ - paths: - - "*" - - store_artifacts: - path: ./docs/build/html - destination: docs - - upload_docs: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda100" - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - run: - name: Generate netrc - command: | - # set credentials for https pushing - # requires the org-member context - cat > ~/.netrc \<> ~/.bashrc -CMD [ "/bin/bash"] diff --git a/.circleci/unittest/android/scripts/binary_android_build.sh b/.circleci/unittest/android/scripts/binary_android_build.sh deleted file mode 100644 index 0d8c0d47d8a..00000000000 --- a/.circleci/unittest/android/scripts/binary_android_build.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "DIR: $(pwd)" -echo "ANDROID_HOME=${ANDROID_HOME}" -echo "ANDROID_NDK_HOME=${ANDROID_NDK_HOME}" -echo "JAVA_HOME=${JAVA_HOME}" - -WORKSPACE=/home/circleci/workspace -VISION_ANDROID=/home/circleci/project/android - -. /home/circleci/project/.circleci/unittest/android/scripts/install_gradle.sh - -GRADLE_LOCAL_PROPERTIES=${VISION_ANDROID}/local.properties -rm -f $GRADLE_LOCAL_PROPERTIES - -echo "sdk.dir=${ANDROID_HOME}" >> $GRADLE_LOCAL_PROPERTIES -echo "ndk.dir=${ANDROID_NDK_HOME}" >> $GRADLE_LOCAL_PROPERTIES - -echo "GRADLE_PATH $GRADLE_PATH" -echo "GRADLE_HOME $GRADLE_HOME" - -${GRADLE_PATH} --scan --stacktrace --debug --no-daemon -p ${VISION_ANDROID} assemble || true - -mkdir -p ~/workspace/artifacts -find . -type f -name *aar -print | xargs tar cfvz ~/workspace/artifacts/artifacts-aars.tgz -find . 
-type f -name *apk -print | xargs tar cfvz ~/workspace/artifacts/artifacts-apks.tgz diff --git a/.circleci/unittest/android/scripts/binary_android_upload.sh b/.circleci/unittest/android/scripts/binary_android_upload.sh deleted file mode 100644 index 1472a877d90..00000000000 --- a/.circleci/unittest/android/scripts/binary_android_upload.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "DIR: $(pwd)" -echo "ANDROID_HOME=${ANDROID_HOME}" -echo "ANDROID_NDK_HOME=${ANDROID_NDK_HOME}" -echo "JAVA_HOME=${JAVA_HOME}" - -WORKSPACE=/home/circleci/workspace -VISION_ANDROID=/home/circleci/project/android - -. /home/circleci/project/.circleci/unittest/android/scripts/install_gradle.sh - -GRADLE_LOCAL_PROPERTIES=${VISION_ANDROID}/local.properties -rm -f $GRADLE_LOCAL_PROPERTIES -GRADLE_PROPERTIES=/home/circleci/project/android/gradle.properties - -echo "sdk.dir=${ANDROID_HOME}" >> $GRADLE_LOCAL_PROPERTIES -echo "ndk.dir=${ANDROID_NDK_HOME}" >> $GRADLE_LOCAL_PROPERTIES - -echo "SONATYPE_NEXUS_USERNAME=${SONATYPE_NEXUS_USERNAME}" >> $GRADLE_PROPERTIES -echo "mavenCentralRepositoryUsername=${SONATYPE_NEXUS_USERNAME}" >> $GRADLE_PROPERTIES -echo "SONATYPE_NEXUS_PASSWORD=${SONATYPE_NEXUS_PASSWORD}" >> $GRADLE_PROPERTIES -echo "mavenCentralRepositoryPassword=${SONATYPE_NEXUS_PASSWORD}" >> $GRADLE_PROPERTIES - -echo "signing.keyId=${ANDROID_SIGN_KEY}" >> $GRADLE_PROPERTIES -echo "signing.password=${ANDROID_SIGN_PASS}" >> $GRADLE_PROPERTIES - -cat /home/circleci/project/android/gradle.properties | grep VERSION - -${GRADLE_PATH} --scan --stacktrace --debug --no-daemon -p ${VISION_ANDROID} ops:uploadArchives - -mkdir -p ~/workspace/artifacts -find . -type f -name *aar -print | xargs tar cfvz ~/workspace/artifacts/artifacts-aars.tgz diff --git a/.circleci/unittest/android/scripts/install_gradle.sh b/.circleci/unittest/android/scripts/install_gradle.sh deleted file mode 100755 index 5f803abfa94..00000000000 --- a/.circleci/unittest/android/scripts/install_gradle.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -ex - -_https_amazon_aws=https://ossci-android.s3.amazonaws.com -GRADLE_VERSION=6.8.3 - -_gradle_home=/opt/gradle -sudo rm -rf $gradle_home -sudo mkdir -p $_gradle_home - -curl --silent --output /tmp/gradle.zip --retry 3 $_https_amazon_aws/gradle-${GRADLE_VERSION}-bin.zip - -sudo unzip -q /tmp/gradle.zip -d $_gradle_home -rm /tmp/gradle.zip - -sudo chmod -R 777 $_gradle_home - -export GRADLE_HOME=$_gradle_home/gradle-$GRADLE_VERSION -export GRADLE_PATH=${GRADLE_HOME}/bin/gradle diff --git a/.circleci/unittest/ios/scripts/binary_ios_build.sh b/.circleci/unittest/ios/scripts/binary_ios_build.sh deleted file mode 100755 index e2ad7b0c55f..00000000000 --- a/.circleci/unittest/ios/scripts/binary_ios_build.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "" -echo "DIR: $(pwd)" -WORKSPACE=/Users/distiller/workspace -PROJ_ROOT_IOS=/Users/distiller/project/ios -PYTORCH_IOS_NIGHTLY_NAME=libtorch_ios_nightly_build.zip -export TCLLIBPATH="/usr/local/lib" - -# install conda -curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x ~/conda.sh -/bin/bash ~/conda.sh -b -p ~/anaconda -export PATH="~/anaconda/bin:${PATH}" -source ~/anaconda/bin/activate - -# install dependencies -conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi requests typing_extensions wget --yes -conda install -c conda-forge valgrind --yes -export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - -# sync 
submodules -cd ${PROJ_ROOT_IOS} -git submodule sync -git submodule update --init --recursive - -# download pytorch-iOS nightly build and unzip it -mkdir -p ${PROJ_ROOT_IOS}/lib -mkdir -p ${PROJ_ROOT_IOS}/build -mkdir -p ${PROJ_ROOT_IOS}/pytorch -TORCH_ROOT="${PROJ_ROOT_IOS}/pytorch" - -cd ${TORCH_ROOT} -wget https://ossci-ios-build.s3.amazonaws.com/${PYTORCH_IOS_NIGHTLY_NAME} -mkdir -p ./build_ios -unzip -d ./build_ios ./${PYTORCH_IOS_NIGHTLY_NAME} - -LIBTORCH_HEADER_ROOT="${TORCH_ROOT}/build_ios/install/include" -cd ${PROJ_ROOT_IOS} -IOS_ARCH=${IOS_ARCH} LIBTORCH_HEADER_ROOT=${LIBTORCH_HEADER_ROOT} ./build_ios.sh -rm -rf ${TORCH_ROOT} - -# store the binary -DEST_DIR=${WORKSPACE}/ios/${IOS_ARCH} -mkdir -p ${DEST_DIR} -cp ${PROJ_ROOT_IOS}/lib/*.a ${DEST_DIR} diff --git a/.circleci/unittest/ios/scripts/binary_ios_upload.sh b/.circleci/unittest/ios/scripts/binary_ios_upload.sh deleted file mode 100644 index ce56388e5da..00000000000 --- a/.circleci/unittest/ios/scripts/binary_ios_upload.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "" -echo "DIR: $(pwd)" - -WORKSPACE=/Users/distiller/workspace -PROJ_ROOT=/Users/distiller/project -ARTIFACTS_DIR=${WORKSPACE}/ios -ls ${ARTIFACTS_DIR} -ZIP_DIR=${WORKSPACE}/zip -mkdir -p ${ZIP_DIR}/install/lib - -# build a FAT bianry -cd ${ZIP_DIR}/install/lib -libs=("${ARTIFACTS_DIR}/x86_64/libtorchvision_ops.a" "${ARTIFACTS_DIR}/arm64/libtorchvision_ops.a") -lipo -create "${libs[@]}" -o ${ZIP_DIR}/install/lib/libtorchvision_ops.a -lipo -i ${ZIP_DIR}/install/lib/*.a - -# copy the license -cp ${PROJ_ROOT}/LICENSE ${ZIP_DIR}/ -# zip the library -ZIPFILE=libtorchvision_ops_ios_nightly_build.zip -cd ${ZIP_DIR} -#for testing -touch version.txt -echo $(date +%s) > version.txt -zip -r ${ZIPFILE} install version.txt LICENSE - -# upload to aws -# Install conda then 'conda install' awscli -curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x ~/conda.sh -/bin/bash ~/conda.sh -b -p ~/anaconda -export PATH="~/anaconda/bin:${PATH}" -source ~/anaconda/bin/activate -conda install -c conda-forge awscli --yes -set +x -export AWS_ACCESS_KEY_ID=${AWS_S3_ACCESS_KEY_FOR_PYTORCH_BINARY_UPLOAD} -export AWS_SECRET_ACCESS_KEY=${AWS_S3_ACCESS_SECRET_FOR_PYTORCH_BINARY_UPLOAD} -set -x -aws s3 cp ${ZIPFILE} s3://ossci-ios-build/ --acl public-read diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml deleted file mode 100644 index 77ee9929519..00000000000 --- a/.circleci/unittest/linux/scripts/environment.yml +++ /dev/null @@ -1,16 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pytest - - pytest-cov - - pytest-mock - - pip - - libpng - - jpeg - - ca-certificates - - h5py - - pip: - - future - - scipy - - av diff --git a/.circleci/unittest/linux/scripts/install.sh b/.circleci/unittest/linux/scripts/install.sh deleted file mode 100755 index c9c85bdd88a..00000000000 --- a/.circleci/unittest/linux/scripts/install.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash - -unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. -# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. 
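The install script that follows maps the compact `CU_VERSION` string (for example `cu113`) to a dotted `CUDA_VERSION` by slicing on length: four characters means a single-digit major version, five characters a two-digit one. A small sketch of that mapping with a few representative values (illustrative only, not the full list of supported toolkits):

```bash
# Same substring logic as in .circleci/unittest/*/scripts/install.sh.
for CU_VERSION in cu92 cu102 cu113 cu117; do
  if [[ ${#CU_VERSION} -eq 4 ]]; then
    CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}"
  elif [[ ${#CU_VERSION} -eq 5 ]]; then
    CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}"
  fi
  echo "${CU_VERSION} -> ${CUDA_VERSION}"   # cu92 -> 9.2, cu102 -> 10.2, cu113 -> 11.3, cu117 -> 11.7
done
```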
- -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env - -if [ "${CU_VERSION:-}" == cpu ] ; then - cudatoolkit="cpuonly" - version="cpu" -else - if [[ ${#CU_VERSION} -eq 4 ]]; then - CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" - elif [[ ${#CU_VERSION} -eq 5 ]]; then - CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi - echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION" - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" - - cudatoolkit="nvidia::cudatoolkit=${version}" - if [[ "$version" == "11.6" || "$version" == "11.7" ]]; then - cudatoolkit=" pytorch-cuda=${version}" - fi -fi - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -printf "Installing PyTorch with %s\n" "${cudatoolkit}" -if [ "${os}" == "MacOSX" ]; then - conda install -y -c "pytorch-${UPLOAD_CHANNEL}" "pytorch-${UPLOAD_CHANNEL}"::pytorch "${cudatoolkit}" -else - conda install -y -c "pytorch-${UPLOAD_CHANNEL}" -c nvidia "pytorch-${UPLOAD_CHANNEL}"::pytorch[build="*${version}*"] "${cudatoolkit}" -fi - -printf "* Installing torchvision\n" -python setup.py develop diff --git a/.circleci/unittest/linux/scripts/post_process.sh b/.circleci/unittest/linux/scripts/post_process.sh deleted file mode 100755 index e97bf2a7b1b..00000000000 --- a/.circleci/unittest/linux/scripts/post_process.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env diff --git a/.circleci/unittest/linux/scripts/run-clang-format.py b/.circleci/unittest/linux/scripts/run-clang-format.py deleted file mode 100755 index 5c61b2519e0..00000000000 --- a/.circleci/unittest/linux/scripts/run-clang-format.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env python -""" -MIT License - -Copyright (c) 2017 Guillaume Papin - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -A wrapper script around clang-format, suitable for linting multiple files -and to use for continuous integration. - -This is an alternative API for the clang-format command line. -It runs over multiple files and directories in parallel. -A diff output is produced and a sensible exit code is returned. 
- -""" - -import argparse -import difflib -import fnmatch -import multiprocessing -import os -import signal -import subprocess -import sys -import traceback -from functools import partial - -try: - from subprocess import DEVNULL # py3k -except ImportError: - DEVNULL = open(os.devnull, "wb") - - -DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu" - - -class ExitStatus: - SUCCESS = 0 - DIFF = 1 - TROUBLE = 2 - - -def list_files(files, recursive=False, extensions=None, exclude=None): - if extensions is None: - extensions = [] - if exclude is None: - exclude = [] - - out = [] - for file in files: - if recursive and os.path.isdir(file): - for dirpath, dnames, fnames in os.walk(file): - fpaths = [os.path.join(dirpath, fname) for fname in fnames] - for pattern in exclude: - # os.walk() supports trimming down the dnames list - # by modifying it in-place, - # to avoid unnecessary directory listings. - dnames[:] = [x for x in dnames if not fnmatch.fnmatch(os.path.join(dirpath, x), pattern)] - fpaths = [x for x in fpaths if not fnmatch.fnmatch(x, pattern)] - for f in fpaths: - ext = os.path.splitext(f)[1][1:] - if ext in extensions: - out.append(f) - else: - out.append(file) - return out - - -def make_diff(file, original, reformatted): - return list( - difflib.unified_diff( - original, reformatted, fromfile=f"{file}\t(original)", tofile=f"{file}\t(reformatted)", n=3 - ) - ) - - -class DiffError(Exception): - def __init__(self, message, errs=None): - super().__init__(message) - self.errs = errs or [] - - -class UnexpectedError(Exception): - def __init__(self, message, exc=None): - super().__init__(message) - self.formatted_traceback = traceback.format_exc() - self.exc = exc - - -def run_clang_format_diff_wrapper(args, file): - try: - ret = run_clang_format_diff(args, file) - return ret - except DiffError: - raise - except Exception as e: - raise UnexpectedError(f"{file}: {e.__class__.__name__}: {e}", e) - - -def run_clang_format_diff(args, file): - try: - with open(file, encoding="utf-8") as f: - original = f.readlines() - except OSError as exc: - raise DiffError(str(exc)) - invocation = [args.clang_format_executable, file] - - # Use of utf-8 to decode the process output. - # - # Hopefully, this is the correct thing to do. - # - # It's done due to the following assumptions (which may be incorrect): - # - clang-format will returns the bytes read from the files as-is, - # without conversion, and it is already assumed that the files use utf-8. - # - if the diagnostics were internationalized, they would use utf-8: - # > Adding Translations to Clang - # > - # > Not possible yet! - # > Diagnostic strings should be written in UTF-8, - # > the client can translate to the relevant code page if needed. - # > Each translation completely replaces the format string - # > for the diagnostic. 
- # > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation - - try: - proc = subprocess.Popen( - invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, encoding="utf-8" - ) - except OSError as exc: - raise DiffError(f"Command '{subprocess.list2cmdline(invocation)}' failed to start: {exc}") - proc_stdout = proc.stdout - proc_stderr = proc.stderr - - # hopefully the stderr pipe won't get full and block the process - outs = list(proc_stdout.readlines()) - errs = list(proc_stderr.readlines()) - proc.wait() - if proc.returncode: - raise DiffError( - "Command '{}' returned non-zero exit status {}".format( - subprocess.list2cmdline(invocation), proc.returncode - ), - errs, - ) - return make_diff(file, original, outs), errs - - -def bold_red(s): - return "\x1b[1m\x1b[31m" + s + "\x1b[0m" - - -def colorize(diff_lines): - def bold(s): - return "\x1b[1m" + s + "\x1b[0m" - - def cyan(s): - return "\x1b[36m" + s + "\x1b[0m" - - def green(s): - return "\x1b[32m" + s + "\x1b[0m" - - def red(s): - return "\x1b[31m" + s + "\x1b[0m" - - for line in diff_lines: - if line[:4] in ["--- ", "+++ "]: - yield bold(line) - elif line.startswith("@@ "): - yield cyan(line) - elif line.startswith("+"): - yield green(line) - elif line.startswith("-"): - yield red(line) - else: - yield line - - -def print_diff(diff_lines, use_color): - if use_color: - diff_lines = colorize(diff_lines) - sys.stdout.writelines(diff_lines) - - -def print_trouble(prog, message, use_colors): - error_text = "error:" - if use_colors: - error_text = bold_red(error_text) - print(f"{prog}: {error_text} {message}", file=sys.stderr) - - -def main(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--clang-format-executable", - metavar="EXECUTABLE", - help="path to the clang-format executable", - default="clang-format", - ) - parser.add_argument( - "--extensions", - help=f"comma separated list of file extensions (default: {DEFAULT_EXTENSIONS})", - default=DEFAULT_EXTENSIONS, - ) - parser.add_argument("-r", "--recursive", action="store_true", help="run recursively over directories") - parser.add_argument("files", metavar="file", nargs="+") - parser.add_argument("-q", "--quiet", action="store_true") - parser.add_argument( - "-j", - metavar="N", - type=int, - default=0, - help="run N clang-format jobs in parallel (default number of cpus + 1)", - ) - parser.add_argument( - "--color", default="auto", choices=["auto", "always", "never"], help="show colored diff (default: auto)" - ) - parser.add_argument( - "-e", - "--exclude", - metavar="PATTERN", - action="append", - default=[], - help="exclude paths matching the given glob-like pattern(s) from recursive search", - ) - - args = parser.parse_args() - - # use default signal handling, like diff return SIGINT value on ^C - # https://bugs.python.org/issue14229#msg156446 - signal.signal(signal.SIGINT, signal.SIG_DFL) - try: - signal.SIGPIPE - except AttributeError: - # compatibility, SIGPIPE does not exist on Windows - pass - else: - signal.signal(signal.SIGPIPE, signal.SIG_DFL) - - colored_stdout = False - colored_stderr = False - if args.color == "always": - colored_stdout = True - colored_stderr = True - elif args.color == "auto": - colored_stdout = sys.stdout.isatty() - colored_stderr = sys.stderr.isatty() - - version_invocation = [args.clang_format_executable, "--version"] - try: - subprocess.check_call(version_invocation, stdout=DEVNULL) - except subprocess.CalledProcessError as e: - print_trouble(parser.prog, str(e), 
use_colors=colored_stderr) - return ExitStatus.TROUBLE - except OSError as e: - print_trouble( - parser.prog, - f"Command '{subprocess.list2cmdline(version_invocation)}' failed to start: {e}", - use_colors=colored_stderr, - ) - return ExitStatus.TROUBLE - - retcode = ExitStatus.SUCCESS - files = list_files( - args.files, recursive=args.recursive, exclude=args.exclude, extensions=args.extensions.split(",") - ) - - if not files: - return - - njobs = args.j - if njobs == 0: - njobs = multiprocessing.cpu_count() + 1 - njobs = min(len(files), njobs) - - if njobs == 1: - # execute directly instead of in a pool, - # less overhead, simpler stacktraces - it = (run_clang_format_diff_wrapper(args, file) for file in files) - pool = None - else: - pool = multiprocessing.Pool(njobs) - it = pool.imap_unordered(partial(run_clang_format_diff_wrapper, args), files) - while True: - try: - outs, errs = next(it) - except StopIteration: - break - except DiffError as e: - print_trouble(parser.prog, str(e), use_colors=colored_stderr) - retcode = ExitStatus.TROUBLE - sys.stderr.writelines(e.errs) - except UnexpectedError as e: - print_trouble(parser.prog, str(e), use_colors=colored_stderr) - sys.stderr.write(e.formatted_traceback) - retcode = ExitStatus.TROUBLE - # stop at the first unexpected error, - # something could be very wrong, - # don't process all files unnecessarily - if pool: - pool.terminate() - break - else: - sys.stderr.writelines(errs) - if outs == []: - continue - if not args.quiet: - print_diff(outs, use_color=colored_stdout) - if retcode == ExitStatus.SUCCESS: - retcode = ExitStatus.DIFF - return retcode - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.circleci/unittest/linux/scripts/run_test.sh b/.circleci/unittest/linux/scripts/run_test.sh deleted file mode 100755 index 8f6b8cb8485..00000000000 --- a/.circleci/unittest/linux/scripts/run_test.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env - -python -m torch.utils.collect_env -pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh deleted file mode 100755 index 0574cdff1cf..00000000000 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchvision here, otherwise they also get cached. - -set -e - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -# Avoid error: "fatal: unsafe repository" -git config --global --add safe.directory '*' -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - wget -O miniconda.sh "http://repo.continuum.io/miniconda/Miniconda3-latest-${os}-x86_64.sh" - bash ./miniconda.sh -b -f -p "${conda_dir}" -fi -eval "$(${conda_dir}/bin/conda shell.bash hook)" - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. 
Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -FFMPEG_PIN="=4.2" -if [[ "${PYTHON_VERSION}" = "3.9" ]]; then - FFMPEG_PIN=">=4.2" -fi - -conda install -y -c pytorch "ffmpeg${FFMPEG_PIN}" -conda env update --file "${this_dir}/environment.yml" --prune diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml deleted file mode 100644 index 0e07ae80d0d..00000000000 --- a/.circleci/unittest/windows/scripts/environment.yml +++ /dev/null @@ -1,19 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pytest - - pytest-cov - - pytest-mock - - pip - - libpng - - jpeg - - ca-certificates - - hdf5 - - setuptools - - pip: - - future - - scipy - - av != 9.1.1 - - dataclasses - - h5py diff --git a/.circleci/unittest/windows/scripts/install.sh b/.circleci/unittest/windows/scripts/install.sh deleted file mode 100644 index cfdff3da6ba..00000000000 --- a/.circleci/unittest/windows/scripts/install.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash - -unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. -# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. - -set -ex - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env - -# TODO, refactor the below logic to make it easy to understand how to get correct cuda_version. -if [ "${CU_VERSION:-}" == cpu ] ; then - cudatoolkit="cpuonly" - version="cpu" -else - if [[ ${#CU_VERSION} -eq 4 ]]; then - CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" - elif [[ ${#CU_VERSION} -eq 5 ]]; then - CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi - - cuda_toolkit_pckg="cudatoolkit" - if [[ "$CU_VERSION" == cu116 ]]; then - cuda_toolkit_pckg="cuda" - fi - - echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION" - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" - cudatoolkit="${cuda_toolkit_pckg}=${version}" -fi - -printf "Installing PyTorch with %s\n" "${cudatoolkit}" -conda install -y -c "pytorch-${UPLOAD_CHANNEL}" -c nvidia "pytorch-${UPLOAD_CHANNEL}"::pytorch[build="*${version}*"] "${cudatoolkit}" - -torch_cuda=$(python -c "import torch; print(torch.cuda.is_available())") -echo torch.cuda.is_available is $torch_cuda - -if [ ! 
-z "${CUDA_VERSION:-}" ] ; then - if [ "$torch_cuda" == "False" ]; then - echo "torch with cuda installed but torch.cuda.is_available() is False" - exit 1 - fi -fi - -source "$this_dir/set_cuda_envs.sh" - -printf "* Installing torchvision\n" -"$this_dir/vc_env_helper.bat" python setup.py develop diff --git a/.circleci/unittest/windows/scripts/install_conda.bat b/.circleci/unittest/windows/scripts/install_conda.bat deleted file mode 100644 index 6052ad08b10..00000000000 --- a/.circleci/unittest/windows/scripts/install_conda.bat +++ /dev/null @@ -1 +0,0 @@ -start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda% diff --git a/.circleci/unittest/windows/scripts/post_process.sh b/.circleci/unittest/windows/scripts/post_process.sh deleted file mode 100644 index 5c5cbb758a9..00000000000 --- a/.circleci/unittest/windows/scripts/post_process.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env diff --git a/.circleci/unittest/windows/scripts/run_test.sh b/.circleci/unittest/windows/scripts/run_test.sh deleted file mode 100644 index 802ad37f511..00000000000 --- a/.circleci/unittest/windows/scripts/run_test.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source "$this_dir/set_cuda_envs.sh" - -python -m torch.utils.collect_env -pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/.circleci/unittest/windows/scripts/set_cuda_envs.sh b/.circleci/unittest/windows/scripts/set_cuda_envs.sh deleted file mode 100644 index d1ed415940d..00000000000 --- a/.circleci/unittest/windows/scripts/set_cuda_envs.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash -set -ex - -echo CU_VERSION is "${CU_VERSION}" -echo CUDA_VERSION is "${CUDA_VERSION}" - -# Currenly, CU_VERSION and CUDA_VERSION are not consistent. -# to understand this code, see https://github.com/pytorch/vision/issues/4443 -version="cpu" -if [[ ! -z "${CUDA_VERSION}" ]] ; then - version="$CUDA_VERSION" -else - if [[ ${#CU_VERSION} -eq 5 ]]; then - version="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi -fi - -# Don't use if [[ "$version" == "cpu" ]]; then exit 0 fi. -# It would exit the shell. One result is cpu tests would not run if the shell exit. -# Unless there's an error, Don't exit. -if [[ "$version" != "cpu" ]]; then - # set cuda envs - export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH" - export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}" - export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}" - - if [ ! -d "$CUDA_PATH" ]; then - echo "$CUDA_PATH" does not exist - exit 1 - fi - - if [ ! 
-f "${CUDA_PATH}\include\nvjpeg.h" ]; then - echo "nvjpeg does not exist" - exit 1 - fi - - # check cuda driver version - for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do - if [[ -x "$path" ]]; then - "$path" || echo "true"; - break - fi - done - - which nvcc - nvcc --version - env | grep CUDA -fi diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh deleted file mode 100644 index 5eeb2e17b48..00000000000 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchvision here, otherwise they also get cached. - -set -e - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - export tmp_conda="$(echo $conda_dir | tr '/' '\\')" - export miniconda_exe="$(echo $root_dir | tr '/' '\\')\\miniconda.exe" - curl --output miniconda.exe https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O - "$this_dir/install_conda.bat" - unset tmp_conda - unset miniconda_exe -fi - -eval "$(${conda_dir}/Scripts/conda.exe 'shell.bash' 'hook')" - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -conda env update --file "${this_dir}/environment.yml" --prune - -# 4. Downgrade setuptools on Python 3.7. -# See https://github.com/pytorch/vision/pull/5868 -if [[ "${PYTHON_VERSION}" == '3.7' ]]; then - pip install --upgrade setuptools==58.0.4 -fi diff --git a/.circleci/unittest/windows/scripts/vc_env_helper.bat b/.circleci/unittest/windows/scripts/vc_env_helper.bat deleted file mode 100644 index 9410135677a..00000000000 --- a/.circleci/unittest/windows/scripts/vc_env_helper.bat +++ /dev/null @@ -1,39 +0,0 @@ -@echo on - -set VC_VERSION_LOWER=16 -set VC_VERSION_UPPER=17 - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15INSTALLDIR=%%i" - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto vswhere - ) -) - -:vswhere -if "%VSDEVCMD_ARGS%" == "" ( - call "%VS15VCVARSALL%" x64 || exit /b 1 -) else ( - call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1 -) - -@echo on - -set DISTUTILS_USE_SDK=1 - -set args=%1 -shift -:start -if [%1] == [] goto done -set args=%args% %1 -shift -goto start - -:done -if "%args%" == "" ( - echo Usage: vc_env_helper.bat [command] [args] - echo e.g. 
vc_env_helper.bat cl /c test.cpp -) - -%args% || exit /b 1 diff --git a/.clang-format b/.clang-format index 6d0ab740db4..95d60445f4a 100644 --- a/.clang-format +++ b/.clang-format @@ -60,9 +60,6 @@ MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 @@ -85,4 +82,11 @@ SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 8 UseTab: Never +--- +Language: ObjC +ColumnLimit: 120 +AlignAfterOpenBracket: Align +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false ... diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index eec93854788..138adf1104e 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -7,3 +7,9 @@ d367a01a18a3ae6bee13d8be3b63fd6a581ea46f # Upgrade usort to 1.0.2 and black to 22.3.0 (#5106) 6ca9c76adb6daf2695d603ad623a9cf1c4f4806f +# Fix unnecessary exploded black formatting (#7709) +a335d916db0694770e8152f41e19195de3134523 +# Renaming: `BoundingBox` -> `BoundingBoxes` (#7778) +332bff937c6711666191880fab57fa2f23ae772e +# Upgrade type hint and others to Python 3.9 (#8814) +a095de183d3811d79ed0db2715e7a1c3162fa19d diff --git a/.gitattributes b/.gitattributes index f9d672d7fb5..22d0452f8d7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,3 @@ # To ignore it use below *.ipynb linguist-documentation - -# To exclude autogenerated files from code reviews -.circleci/config.yml linguist-generated=true diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index a073146ebed..ba811554c43 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -48,7 +48,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py + wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py # For security purposes, please check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 27d0f2a1f0b..1a3402466f4 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1 +1,10 @@ tracking_issue: 2447 + +# List of workflows that will be re-run in case of failures +# https://github.com/pytorch/test-infra/blob/main/torchci/lib/bot/retryBot.ts +retryable_workflows: +- Build Linux +- Build Macos +- Build M1 +- Build Windows +- Tests diff --git a/.github/scripts/cmake.sh b/.github/scripts/cmake.sh new file mode 100755 index 00000000000..4217a9d24be --- /dev/null +++ b/.github/scripts/cmake.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +./.github/scripts/setup-env.sh + +# Activate conda environment +set +x && eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate ci && set -x + +# Setup the OS_TYPE environment variable that should be used for conditions involving the OS below. 
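The activation idiom above, `eval "$($(which conda) shell.bash hook)"`, is needed because `conda activate` is a shell function rather than an executable, and it does not exist in a fresh non-interactive CI shell until the hook has been evaluated. A minimal reproduction (the `ci` environment name matches the one the setup scripts create):

```bash
# Without the eval line, `conda activate` fails with "Run 'conda init' ..." in a bare shell.
eval "$($(which conda) shell.bash hook)"   # defines the `conda` shell function for this shell
conda activate ci
conda info --envs                          # the active environment is marked with '*'
```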
+case $(uname) in + Linux) + OS_TYPE=linux + ;; + Darwin) + OS_TYPE=macos + ;; + MSYS*) + OS_TYPE=windows + ;; + *) + echo "Unknown OS type:" $(uname) + exit 1 + ;; +esac + +if [[ $OS_TYPE == macos ]]; then + JOBS=$(sysctl -n hw.logicalcpu) +else + JOBS=$(nproc) +fi + +if [[ $OS_TYPE == linux ]]; then + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" +fi + +TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))") +if [[ $OS_TYPE == windows ]]; then + PACKAGING_DIR="${PWD}/packaging" + export PATH="${TORCH_PATH}/lib:${PATH}" +fi + +Torch_DIR="${TORCH_PATH}/share/cmake/Torch" +if [[ "${GPU_ARCH_TYPE}" == "cuda" ]]; then + WITH_CUDA=1 +else + WITH_CUDA=0 +fi + +echo '::group::Prepare CMake builds' +mkdir -p cpp_build + +pushd examples/cpp +python script_model.py +mkdir -p build +mv resnet18.pt fasterrcnn_resnet50_fpn.pt build +popd + +# This was only needed for the tracing above +pip uninstall -y torchvision +echo '::endgroup::' + +echo '::group::Build and install libtorchvision' +pushd cpp_build + + +# On macOS, CMake is looking for the library (*.dylib) and the header (*.h) separately. By default, it prefers to load +# the header from other packages that install the library. This easily leads to a mismatch if the library installed +# from conda doesn't have the exact same version. Thus, we need to explicitly set CMAKE_FIND_FRAMEWORK=NEVER to force +# it to not load anything from other installed frameworks. Resources: +# https://stackoverflow.com/questions/36523911/osx-homebrew-cmake-libpng-version-mismatch-issue +# https://cmake.org/cmake/help/latest/variable/CMAKE_FIND_FRAMEWORK.html +cmake .. -DTorch_DIR="${Torch_DIR}" -DWITH_CUDA="${WITH_CUDA}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER \ + -DCMAKE_INSTALL_PREFIX="${CONDA_PREFIX}" +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cmake.bat" $JOBS +else + make -j$JOBS + make install +fi + +popd +echo '::endgroup::' + +echo '::group::Build and run C++ example' +pushd examples/cpp/build + +cmake .. -DTorch_DIR="${Torch_DIR}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER \ + -DUSE_TORCHVISION=ON # Needed for faster-rcnn since it's using torchvision ops like NMS. +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cpp_example.bat" $JOBS + cd Release + cp ../resnet18.pt . + cp ../fasterrcnn_resnet50_fpn.pt . +else + make -j$JOBS +fi + +./run_model resnet18.pt +./run_model fasterrcnn_resnet50_fpn.pt + +popd +echo '::endgroup::' diff --git a/.github/scripts/export_IS_M1_CONDA_BUILD_JOB.sh b/.github/scripts/export_IS_M1_CONDA_BUILD_JOB.sh new file mode 100755 index 00000000000..1cca56ddc56 --- /dev/null +++ b/.github/scripts/export_IS_M1_CONDA_BUILD_JOB.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +export IS_M1_CONDA_BUILD_JOB=1 diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh new file mode 100755 index 00000000000..e1c5855f31c --- /dev/null +++ b/.github/scripts/setup-env.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +# Prepare conda +set +x && eval "$($(which conda) shell.bash hook)" && set -x + +# Setup the OS_TYPE environment variable that should be used for conditions involving the OS below. 
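`Torch_DIR` has to point at the directory containing `TorchConfig.cmake`, which ships inside the installed `torch` package; that is why it is derived from `torch.__path__` rather than hard-coded. A quick standalone check, assuming `torch` is importable in the active environment:

```bash
# find_package(Torch) resolves through this config file.
TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))")
Torch_DIR="${TORCH_PATH}/share/cmake/Torch"
ls "${Torch_DIR}/TorchConfig.cmake"   # must exist for -DTorch_DIR="${Torch_DIR}" to work
```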
+case $(uname) in + Linux) + OS_TYPE=linux + ;; + Darwin) + OS_TYPE=macos + ;; + MSYS*) + OS_TYPE=windows + ;; + *) + echo "Unknown OS type:" $(uname) + exit 1 + ;; +esac + +echo '::group::Create build environment' +# See https://github.com/pytorch/vision/issues/7296 for ffmpeg +conda create \ + --name ci \ + --quiet --yes \ + python="${PYTHON_VERSION}" pip \ + ninja cmake \ + libpng \ + libwebp \ + 'ffmpeg<4.3' +conda activate ci +conda install --quiet --yes libjpeg-turbo -c pytorch +pip install --progress-bar=off --upgrade setuptools==72.1.0 + +# See https://github.com/pytorch/vision/issues/6790 +if [[ "${PYTHON_VERSION}" != "3.11" ]]; then + pip install --progress-bar=off av!=10.0.0 +fi + +echo '::endgroup::' + +if [[ "${OS_TYPE}" == windows && "${GPU_ARCH_TYPE}" == cuda ]]; then + echo '::group::Install VisualStudio CUDA extensions on Windows' + TARGET_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/MSBuild/Microsoft/VC/v170/BuildCustomizations" + mkdir -p "${TARGET_DIR}" + cp -r "${CUDA_HOME}/MSBuildExtensions/"* "${TARGET_DIR}" + echo '::endgroup::' +fi + +echo '::group::Install PyTorch' +# TODO: Can we maybe have this as environment variable in the job template? For example, `IS_RELEASE`. +if [[ (${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then + CHANNEL=test +else + CHANNEL=nightly +fi + +case $GPU_ARCH_TYPE in + cpu) + GPU_ARCH_ID="cpu" + ;; + cuda) + VERSION_WITHOUT_DOT=$(echo "${GPU_ARCH_VERSION}" | sed 's/\.//') + GPU_ARCH_ID="cu${VERSION_WITHOUT_DOT}" + ;; + *) + echo "Unknown GPU_ARCH_TYPE=${GPU_ARCH_TYPE}" + exit 1 + ;; +esac +PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" +pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + +if [[ $GPU_ARCH_TYPE == 'cuda' ]]; then + python -c "import torch; exit(not torch.cuda.is_available())" +fi +echo '::endgroup::' + +echo '::group::Install TorchVision' +pip install -e . 
-v --no-build-isolation +echo '::endgroup::' + +echo '::group::Install torchvision-extra-decoders' +# This can be done after torchvision was built +if [[ "$(uname)" == "Linux" && "$(uname -m)" != "aarch64" ]]; then + extra_decoders_channel="--pre --index-url https://download.pytorch.org/whl/nightly/cpu" +else + extra_decoders_channel="" +fi + +pip install torchvision-extra-decoders $extra_decoders_channel +echo '::endgroup::' + +echo '::group::Collect environment information' +conda list +python -m torch.utils.collect_env +echo '::endgroup::' diff --git a/.github/scripts/unittest.sh b/.github/scripts/unittest.sh new file mode 100755 index 00000000000..43968762a8b --- /dev/null +++ b/.github/scripts/unittest.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -euo pipefail + +./.github/scripts/setup-env.sh + +# Activate conda environment +eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate ci + +echo '::group::Install testing utilities' +# TODO: remove the <8 constraint on pytest when https://github.com/pytorch/vision/issues/8238 is closed +pip install --progress-bar=off "pytest<8" pytest-mock pytest-cov expecttest!=0.2.0 requests +echo '::endgroup::' + +python test/smoke_test.py + +# We explicitly ignore the video tests until we resolve https://github.com/pytorch/vision/issues/8162 +pytest --ignore-glob="*test_video*" --ignore-glob="*test_onnx*" --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 -k "not TestFxFeatureExtraction" diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml new file mode 100644 index 00000000000..13518aba924 --- /dev/null +++ b/.github/workflows/build-cmake.yml @@ -0,0 +1,87 @@ +name: CMake + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + linux: + strategy: + matrix: + include: + - runner: linux.12xlarge + gpu-arch-type: cpu + - runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.9 + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + ./.github/scripts/cmake.sh + + macos: + strategy: + matrix: + include: + - runner: macos-m1-stable + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.9 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ${CONDA_RUN} ./.github/scripts/cmake.sh + + windows: + strategy: + matrix: + include: + - runner: windows.4xlarge + gpu-arch-type: cpu + - runner: windows.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.9 + export VC_YEAR=2022 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} 
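The `GPU_ARCH_TYPE`/`GPU_ARCH_VERSION` values exported here are what `setup-env.sh` turns into a PyTorch wheel index URL; a small illustration with representative inputs (the concrete values below are examples only):

```bash
# Mirrors the channel/arch -> index-URL mapping from .github/scripts/setup-env.sh.
CHANNEL=nightly              # becomes "test" on release branches and RC tags
GPU_ARCH_TYPE=cuda
GPU_ARCH_VERSION=12.6
if [[ "${GPU_ARCH_TYPE}" == cpu ]]; then
  GPU_ARCH_ID="cpu"
else
  GPU_ARCH_ID="cu$(echo "${GPU_ARCH_VERSION}" | sed 's/\.//')"      # 12.6 -> cu126
fi
echo "https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}"   # .../whl/nightly/cu126
```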
+ export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/cmake.sh diff --git a/.github/workflows/build-m1-binaries.yml b/.github/workflows/build-m1-binaries.yml deleted file mode 100644 index b34dfd8f528..00000000000 --- a/.github/workflows/build-m1-binaries.yml +++ /dev/null @@ -1,157 +0,0 @@ -name: Build on M1 -on: - pull_request: - paths: - - .github/workflows/build-m1-binaries.yml - push: - branches: - - nightly - - main - - release/* - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - workflow_dispatch: -env: - CHANNEL: "nightly" -jobs: - build_wheels: - name: "Build TorchVision M1 wheels" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8", "3.9", "3.10" ] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Build TorchVision M1 wheel - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - # Needed for JPEG library detection as setup.py detects conda presence by running `shutil.which('conda')` - set -ex - . packaging/pkg_helpers.bash - # if we are uploading to test channell, our version consist only of the base: 0.x.x - no date string or suffix added - if [[ $CHANNEL == "test" ]]; then - setup_base_build_version - else - setup_build_version - fi - - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg wheel pkg-config - conda run -p ${ENV_NAME} python3 -mpip install torch --pre --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 -mpip install delocate - conda run -p ${ENV_NAME} python3 setup.py bdist_wheel - export PYTORCH_VERSION="$(conda run -p ${ENV_NAME} python3 -mpip show torch | grep ^Version: | sed 's/Version: *//')" - conda run -p ${ENV_NAME} DYLD_FALLBACK_LIBRARY_PATH="${ENV_NAME}/lib" delocate-wheel -v --ignore-missing-dependencies dist/*.whl - conda env remove -p ${ENV_NAME} - - name: Test wheel - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-test-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - set -ex - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy - conda run -p ${ENV_NAME} python3 -mpip install torch --pre --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 -mpip install dist/*.whl - # Test torch is importable, by changing cwd and running import commands - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torchvision;print('torchvision version is ', torchvision.__version__)" - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torch;import torchvision;print('Is torchvision useable?', all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]))" - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torchvision;print(torchvision.io.read_image('${PWD}/gallery/assets/dog1.jpg').shape)" - conda env remove -p ${ENV_NAME} - - name: Upload wheel 
to GitHub - uses: actions/upload-artifact@v3 - with: - name: torchvision-py${{ matrix.py_vers }}-macos11-m1 - path: dist/ - - name: Upload wheel to S3 - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/')) }} - shell: arch -arch arm64 bash {0} - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - run: | - for pkg in dist/*; do - aws s3 cp "$pkg" "s3://pytorch/whl/${CHANNEL}/cpu/" --acl public-read - done - build_conda: - name: "Build TorchVision M1 conda packages" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8", "3.9", "3.10" ] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Set CHANNEL Release (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Install conda-build and purge previous artifacts - shell: arch -arch arm64 bash {0} - run: | - conda install -yq conda-build - conda build purge-all - - - name: Build TorchVision M1 conda package - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PYTHON_VERSION: ${{ matrix.py_vers }} - CU_VERSION: cpu - run: | - set -ex - . packaging/pkg_helpers.bash - - if [[ $CHANNEL == "test" ]]; then - setup_base_build_version - else - setup_build_version - fi - - setup_conda_pytorch_constraint - export SOURCE_ROOT_DIR=$(pwd) - conda build \ - -c defaults \ - $CONDA_CHANNEL_FLAGS \ - --no-anaconda-upload \ - --python "$PYTHON_VERSION" \ - --output-folder=dist/ \ - packaging/torchvision - - name: Upload package to GitHub - uses: actions/upload-artifact@v3 - with: - name: torchvision-py${{ matrix.py_vers }}-macos11-m1-conda - path: dist/ - - name: Upload package to conda - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/')) }} - shell: arch -arch arm64 bash {0} - env: - CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - conda install -yq anaconda-client - set -x - export ANACONDA_PATH=$(conda info --base)/bin - $ANACONDA_PATH/anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload dist/osx-arm64/*.tar.bz2 -u "pytorch-${CHANNEL}" --label main --no-progress --force diff --git a/.github/workflows/build-wheels-aarch64-linux.yml b/.github/workflows/build-wheels-aarch64-linux.yml new file mode 100644 index 00000000000..89948db6397 --- /dev/null +++ b/.github/workflows/build-wheels-aarch64-linux.yml @@ -0,0 +1,54 @@ +name: Build Aarch64 Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux-aarch64 + test-infra-repository: pytorch/test-infra + test-infra-ref: main + 
with-cuda: enable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + architecture: aarch64 + setup-miniconda: false diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml new file mode 100644 index 00000000000..818f32c102b --- /dev/null +++ b/.github/workflows/build-wheels-linux.yml @@ -0,0 +1,52 @@ +name: Build Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-xpu: enable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-m1.yml new file mode 100644 index 00000000000..76709b755e8 --- /dev/null +++ b/.github/workflows/build-wheels-m1.yml @@ -0,0 +1,52 @@ +name: Build M1 Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: macos-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ 
matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + runner-type: macos-m1-stable + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/build-wheels-windows.yml b/.github/workflows/build-wheels-windows.yml new file mode 100644 index 00000000000..a269aea2604 --- /dev/null +++ b/.github/workflows/build-wheels-windows.yml @@ -0,0 +1,54 @@ +name: Build Windows Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: windows + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-xpu: enable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + env-script: packaging/windows/internal/vc_env_helper.bat + post-script: "python packaging/wheel/relocate.py" + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_windows.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + env-script: ${{ matrix.env-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/build_wheel_windows_arm64.yml b/.github/workflows/build_wheel_windows_arm64.yml new file mode 100644 index 00000000000..0c578b194ab --- /dev/null +++ b/.github/workflows/build_wheel_windows_arm64.yml @@ -0,0 +1,54 @@ +name: Build Windows ARM64 Wheels + +on: + pull_request: + paths: + - .github/workflows/build_wheel_windows_arm64.yml + push: + branches: + - nightly + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: windows-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-cuda: disable + + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + smoke-test-script: test/smoke_test.py + pre-script: packaging/pre_build_script_arm64.sh + package-name: torchvision + architecture: "arm64" + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_windows.yml@main + with: + repository: 
${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + pre-script: ${{ matrix.pre-script }} + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + architecture: ${{ matrix.architecture }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000000..8b341622181 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,133 @@ +name: Docs + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + - v[0-9]+.[0-9]+.[0-9] + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + build: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + upload-artifact: docs + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.10 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + # FIXME: not sure why we need this. `ldd torchvision/video_reader.so` shows that it + # already links against the one pulled from conda. However, at runtime it pulls from + # /lib64 + # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't + # have to pay attention in all other workflows? + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + + cd docs + + echo '::group::Install doc requirements' + pip install --progress-bar=off -r requirements.txt + echo '::endgroup::' + + if [[ ${{ github.event_name }} == push && (${{ github.ref_type }} == tag || (${{ github.ref_type }} == branch && ${{ github.ref_name }} == release/*)) ]]; then + echo '::group::Enable version string sanitization' + # This environment variable just has to exist and must not be empty. The actual value is arbitrary. + # See docs/source/conf.py for details + export TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS=1 + echo '::endgroup::' + fi + + # The runner does not have sufficient memory to run with as many processes as there are + # cores (`-j auto`). Thus, we limit to a single process (`-j 1`) here. + sed -i -e 's/-j auto/-j 1/' Makefile + make html + + # Below is an imperfect way for us to add "try on Colab" links to all of our gallery examples. + # sphinx-gallery will convert all gallery examples to .ipynb notebooks and stores them in + # build/html/_downloads//.ipynb + # We copy all those ipynb files in a more convenient folder so that we can more easily link to them. + mkdir build/html/_generated_ipynb_notebooks + for file in `find build/html/_downloads`; do + if [[ $file == *.ipynb ]]; then + cp $file build/html/_generated_ipynb_notebooks/ + fi + done + + cp -r build/html "${RUNNER_ARTIFACT_DIR}" + + # On PRs we also want to upload the docs into our S3 bucket for preview. 
+ if [[ ${{ github.event_name == 'pull_request' }} ]]; then + cp -r build/html/* "${RUNNER_DOCS_DIR}" + fi + + upload: + needs: build + if: github.repository == 'pytorch/vision' && github.event_name == 'push' && + ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag') + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: write + with: + repository: pytorch/vision + download-artifact: docs + ref: gh-pages + test-infra-ref: main + script: | + set -euo pipefail + + REF_TYPE=${{ github.ref_type }} + REF_NAME=${{ github.ref_name }} + + if [[ "${REF_TYPE}" == branch ]]; then + TARGET_FOLDER="${REF_NAME}" + elif [[ "${REF_TYPE}" == tag ]]; then + case "${REF_NAME}" in + *-rc*) + echo "Aborting upload since this is an RC tag: ${REF_NAME}" + exit 0 + ;; + *) + # Strip the leading "v" as well as the trailing patch version. For example: + # 'v0.15.2' -> '0.15' + TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/v\([0-9]\+\)\.\([0-9]\+\)\.[0-9]\+/\1.\2/') + ;; + esac + fi + echo "Target Folder: ${TARGET_FOLDER}" + + mkdir -p "${TARGET_FOLDER}" + rm -rf "${TARGET_FOLDER}"/* + mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" + git add "${TARGET_FOLDER}" || true + + if [[ "${TARGET_FOLDER}" == main ]]; then + mkdir -p _static + rm -rf _static/* + cp -r "${TARGET_FOLDER}"/_static/* _static + git add _static || true + fi + + git config user.name 'pytorchbot' + git config user.email 'soumith+bot@pytorch.org' + git config http.postBuffer 524288000 + git commit -m "auto-generating sphinx docs" || true + git push diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000000..deb3cdff83d --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,81 @@ +name: Lint + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + python-source-and-configs: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + echo '::group::Setup environment' + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda create --name ci --quiet --yes python=3.9 pip + conda activate ci + echo '::endgroup::' + + echo '::group::Install lint tools' + pip install --progress-bar=off pre-commit + echo '::endgroup::' + + set +e + pre-commit run --all-files + + if [ $? 
-ne 0 ]; then + git --no-pager diff + exit 1 + fi + + python-types: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.11 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install lint tools' + pip install --progress-bar=off "mypy==1.13.0" + echo '::endgroup::' + + echo '::group::Lint Python types' + mypy --install-types --non-interactive --config-file mypy.ini + echo '::endgroup::' + + # bc: + # if: github.event.pull_request + # runs-on: ubuntu-latest + # steps: + # - name: Run BC Lint Action + # uses: pytorch/test-infra/.github/actions/bc-lint@main + # with: + # repo: ${{ github.event.pull_request.head.repo.full_name }} + # base_sha: ${{ github.event.pull_request.base.sha }} + # head_sha: ${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml index 20c37e4fd88..bf6349ab02e 100644 --- a/.github/workflows/pr-labels.yml +++ b/.github/workflows/pr-labels.yml @@ -8,28 +8,33 @@ on: jobs: is-properly-labeled: runs-on: ubuntu-latest + permissions: + pull-requests: write steps: - name: Set up python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 - name: Install requests run: pip install requests - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Process commit and find merger responsible for labeling id: commit - run: echo "::set-output name=merger::$(python .github/process_commit.py ${{ github.sha }})" + run: | + MERGER=$(python .github/process_commit.py ${{ github.sha }}) + echo "merger=${MERGER}" | tee --append $GITHUB_OUTPUT - name: Ping merger responsible for labeling if necessary if: ${{ steps.commit.outputs.merger != '' }} - uses: mshick/add-pr-comment@v1 + uses: mshick/add-pr-comment@v2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: message: | Hey ${{ steps.commit.outputs.merger }}! - You merged this PR, but no labels were added. The list of valid labels is available at https://github.com/pytorch/vision/blob/main/.github/process_commit.py + You merged this PR, but no labels were added. + The list of valid labels is available at https://github.com/pytorch/vision/blob/main/.github/process_commit.py diff --git a/.github/workflows/prototype-tests-linux-gpu.yml b/.github/workflows/prototype-tests-linux-gpu.yml new file mode 100644 index 00000000000..18881bfacb2 --- /dev/null +++ b/.github/workflows/prototype-tests-linux-gpu.yml @@ -0,0 +1,60 @@ +name: Prototype tests on Linux + +# IMPORTANT: This workflow has been manually disabled from the GitHub interface +# in June 2024. The file is kept for reference in case we ever put this back. 
+ +on: + pull_request: + +jobs: + unittests-prototype: + strategy: + matrix: + python-version: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + runner: ["linux.12xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: "3.9" + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install testing utilities' + pip install --progress-bar=off pytest pytest-mock pytest-cov + echo '::endgroup::' + + # We don't want to run the prototype datasets tests. Since the positional glob into `pytest`, i.e. + # `test/test_prototype*.py` takes the highest priority, neither `--ignore` nor `--ignore-glob` can help us here. + rm test/test_prototype_datasets*.py + pytest \ + -v --durations=25 \ + --cov=torchvision/prototype --cov-report=term-missing \ + --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" \ + test/test_prototype_*.py diff --git a/.github/workflows/prototype-tests.yml b/.github/workflows/prototype-tests.yml deleted file mode 100644 index ff29168d9a7..00000000000 --- a/.github/workflows/prototype-tests.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: tests - -on: - pull_request: - -jobs: - prototype: - strategy: - matrix: - os: - - ubuntu-latest - - windows-latest - - macos-latest - fail-fast: false - - runs-on: ${{ matrix.os }} - - steps: - - name: Set up python - uses: actions/setup-python@v3 - with: - python-version: 3.7 - - - name: Upgrade system packages - run: python -m pip install --upgrade pip setuptools wheel - - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Install PyTorch nightly builds - run: pip install --progress-bar=off --pre torch torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu/ - - - name: Install torchvision - run: pip install --progress-bar=off --no-build-isolation --editable . 
- - - name: Install other prototype dependencies - run: pip install --progress-bar=off scipy pycocotools h5py iopath - - - name: Install test requirements - run: pip install --progress-bar=off pytest pytest-mock - - - name: Run prototype tests - shell: bash - run: pytest -vvv --durations=20 test/test_prototype_*.py diff --git a/.github/workflows/test-m1.yml b/.github/workflows/test-m1.yml deleted file mode 100644 index 1e5f79f82fd..00000000000 --- a/.github/workflows/test-m1.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Unit-tests on M1 -on: - pull_request: - push: - branches: - - nightly - - main - - release/* - workflow_dispatch: -env: - CHANNEL: "nightly" -jobs: - tests: - name: "Unit-tests on M1" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8"] - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Install TorchVision - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . ~/miniconda3/etc/profile.d/conda.sh - # Needed for JPEG library detection as setup.py detects conda presence by running `shutil.which('conda')` - export PATH=~/miniconda3/bin:$PATH - set -ex - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy - conda run -p ${ENV_NAME} python3 -mpip install --pre torch --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 setup.py develop - conda run -p ${ENV_NAME} python3 -mpip install pytest pytest-mock av - - name: Run tests - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . ~/miniconda3/etc/profile.d/conda.sh - set -ex - conda run -p ${ENV_NAME} --no-capture-output python3 -u -mpytest -v --tb=long --durations 20 - conda env remove -p ${ENV_NAME} diff --git a/.github/workflows/tests-schedule.yml b/.github/workflows/tests-schedule.yml index ecc283cac27..3cba2ef59d8 100644 --- a/.github/workflows/tests-schedule.yml +++ b/.github/workflows/tests-schedule.yml @@ -18,14 +18,20 @@ jobs: - name: Set up python uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.9 - name: Upgrade system packages run: python -m pip install --upgrade pip setuptools wheel + - name: SSL + run: python -c 'import ssl; print(ssl.OPENSSL_VERSION)' + - name: Checkout repository uses: actions/checkout@v2 + - name: TODO REMOVE THIS! Install non pre-release version of mpmath. + run: pip install "mpmath<1.4" + - name: Install torch nightly build run: pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html @@ -33,7 +39,7 @@ jobs: run: pip install --no-build-isolation --editable . 
- name: Install all optional dataset requirements - run: pip install scipy pycocotools lmdb requests + run: pip install scipy pycocotools lmdb gdown - name: Install tests requirements run: pip install pytest diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 00000000000..5e0ed381b01 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,184 @@ +name: Tests + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + unittests-linux: + strategy: + matrix: + python-version: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + runner: ["linux.12xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: 3.9 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + unittests-macos: + strategy: + matrix: + python-version: + - "3.9" + - "3.10" + # TODO put back 3.11 (See blame) + # - "3.11" + - "3.12" + runner: ["macos-m1-stable"] + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + repository: pytorch/vision + timeout: 240 + runner: ${{ matrix.runner }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ${CONDA_RUN} ./.github/scripts/unittest.sh + + unittests-windows: + strategy: + matrix: + python-version: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + runner: ["windows.4xlarge"] + gpu-arch-type: ["cpu"] + # TODO: put GPU testing back + # include: + # - python-version: "3.9" + # runner: windows.g5.4xlarge.nvidia.gpu + # gpu-arch-type: cuda + # gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + test-infra-ref: main + script: | + set -euxo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export VC_YEAR=2022 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + # onnx: + # uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + # permissions: + # id-token: write + # contents: read + # with: + # repository: pytorch/vision + # test-infra-ref: main + # script: | + # set -euo pipefail + + # export PYTHON_VERSION=3.10 + # export GPU_ARCH_TYPE=cpu + # export GPU_ARCH_VERSION='' + + # ./.github/scripts/setup-env.sh + + # # Prepare conda + # CONDA_PATH=$(which conda) + # eval "$(${CONDA_PATH} shell.bash hook)" + # conda activate ci + + # echo '::group::Install ONNX' + # pip install --progress-bar=off onnx onnxruntime + # echo '::endgroup::' + + # echo '::group::Install testing utilities' + # pip install --progress-bar=off pytest "numpy<2" 
+ # echo '::endgroup::' + + # echo '::group::Run ONNX tests' + # pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_onnx.py + # echo '::endgroup::' + + unittests-extended: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + if: contains(github.event.pull_request.labels.*.name, 'run-extended') + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.9 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Pre-download model weights' + pip install --progress-bar=off aiohttp aiofiles tqdm + python scripts/download_model_urls.py + echo '::endgroup::' + + echo '::group::Install testing utilities' + # TODO: remove the <8 constraint on pytest when https://github.com/pytorch/vision/issues/8238 is closed + pip install --progress-bar=off "pytest<8" + echo '::endgroup::' + + echo '::group::Run extended unittests' + export PYTORCH_TEST_WITH_EXTENDED=1 + pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_extended_*.py + echo '::endgroup::' diff --git a/.gitignore b/.gitignore index f16b54061e0..c2d4d2a1c42 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ docs/source/auto_examples/ docs/source/gen_modules/ docs/source/generated/ docs/source/models/generated/ +docs/source/sg_execution_times.rst # pytorch-sphinx-theme gets installed here docs/src @@ -42,3 +43,5 @@ xcuserdata/ # direnv .direnv .envrc + +scripts/release_notes/data.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 463a97359ab..73a0b329112 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,20 +6,22 @@ repos: - id: check-toml - id: check-yaml exclude: packaging/.* + args: + - --allow-multiple-documents - id: mixed-line-ending args: [--fix=lf] - id: end-of-file-fixer - repo: https://github.com/omnilib/ufmt - rev: v1.3.2 + rev: v1.3.3 hooks: - id: ufmt additional_dependencies: - black == 22.3.0 - usort == 1.0.2 - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.2 + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 hooks: - id: flake8 args: [--config=setup.cfg] @@ -28,3 +30,12 @@ repos: rev: 6.1.1 hooks: - id: pydocstyle + + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.3 + hooks: + - id: clang-format + name: clang-format + files: \.(cpp|hpp|c|h|cu)$ + types: [file] + exclude: ^torchvision/csrc/io/image/cpu/giflib/ diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000000..37db28b2bad --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,14 @@ +cff-version: 1.2.0 +title: "TorchVision: PyTorch's Computer Vision library" +message: >- + If you find TorchVision useful in your work, please + consider citing the following BibTeX entry. 
+type: software +authors: + - given-names: TorchVision maintainers and contributors +url: "https://github.com/pytorch/vision" +license: "BSD-3-Clause" +date-released: "2016-11-06" +journal: "GitHub repository" +publisher: "GitHub" +key: "torchvision2016" diff --git a/CMakeLists.txt b/CMakeLists.txt index 85b878307cf..f2430559909 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,29 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.18) project(torchvision) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) file(STRINGS version.txt TORCHVISION_VERSION) option(WITH_CUDA "Enable CUDA support" OFF) +option(WITH_MPS "Enable MPS support" OFF) option(WITH_PNG "Enable features requiring LibPNG." ON) option(WITH_JPEG "Enable features requiring LibJPEG." ON) -option(USE_PYTHON "Link to Python when building" OFF) +# Libwebp is disabled by default, which means enabling it from cmake is largely +# untested. Since building from cmake is very low pri anyway, this is OK. If +# you're a user and you need this, please open an issue (and a PR!). +option(WITH_WEBP "Enable features requiring LibWEBP." OFF) +# Same here +option(WITH_AVIF "Enable features requiring LibAVIF." OFF) if(WITH_CUDA) enable_language(CUDA) add_definitions(-D__CUDA_NO_HALF_OPERATORS__) add_definitions(-DWITH_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") - # CUDA-11.x can not be compiled using C++14 standard on Windows - string(REGEX MATCH "^[0-9]+" CUDA_MAJOR ${CMAKE_CUDA_COMPILER_VERSION}) - if(${CUDA_MAJOR} GREATER 10 AND MSVC) - set(CMAKE_CXX_STANDARD 17) - endif() +endif() + +if(WITH_MPS) + enable_language(OBJC OBJCXX) + add_definitions(-DWITH_MPS) endif() find_package(Torch REQUIRED) @@ -32,9 +38,14 @@ if (WITH_JPEG) find_package(JPEG REQUIRED) endif() -if (USE_PYTHON) - add_definitions(-DUSE_PYTHON) - find_package(Python3 REQUIRED COMPONENTS Development) +if (WITH_WEBP) + add_definitions(-DWEBP_FOUND) + find_package(WEBP REQUIRED) +endif() + +if (WITH_AVIF) + add_definitions(-DAVIF_FOUND) + find_package(AVIF REQUIRED) endif() function(CUDA_CONVERT_FLAGS EXISTING_TARGET) @@ -79,11 +90,14 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) set(TVCPP torchvision/csrc) -list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCPP}/models ${TVCPP}/ops +list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCPP}/io/image/cpu/giflib ${TVCPP}/models ${TVCPP}/ops ${TVCPP}/ops/autograd ${TVCPP}/ops/cpu ${TVCPP}/io/image/cuda) if(WITH_CUDA) list(APPEND ALLOW_LISTED ${TVCPP}/ops/cuda ${TVCPP}/ops/autocast) endif() +if(WITH_MPS) + list(APPEND ALLOW_LISTED ${TVCPP}/ops/mps) +endif() FOREACH(DIR ${ALLOW_LISTED}) file(GLOB ALL_SOURCES ${ALL_SOURCES} ${DIR}/*.*) @@ -92,6 +106,12 @@ ENDFOREACH() add_library(${PROJECT_NAME} SHARED ${ALL_SOURCES}) target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES}) +if(WITH_MPS) + find_library(metal NAMES Metal) + find_library(foundation NAMES Foundation) + target_link_libraries(${PROJECT_NAME} PRIVATE ${metal} ${foundation}) +endif() + if (WITH_PNG) target_link_libraries(${PROJECT_NAME} PRIVATE ${PNG_LIBRARY}) endif() @@ -100,8 +120,12 @@ if (WITH_JPEG) target_link_libraries(${PROJECT_NAME} PRIVATE ${JPEG_LIBRARIES}) endif() -if (USE_PYTHON) - target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python) +if (WITH_WEBP) + target_link_libraries(${PROJECT_NAME} PRIVATE ${WEBP_LIBRARIES}) +endif() + +if (WITH_AVIF) + target_link_libraries(${PROJECT_NAME} PRIVATE ${AVIF_LIBRARIES}) 
endif() set_target_properties(${PROJECT_NAME} PROPERTIES @@ -118,6 +142,14 @@ if (WITH_JPEG) include_directories(${JPEG_INCLUDE_DIRS}) endif() +if (WITH_WEBP) + include_directories(${WEBP_INCLUDE_DIRS}) +endif() + +if (WITH_AVIF) + include_directories(${AVIF_INCLUDE_DIRS}) +endif() + set(TORCHVISION_CMAKECONFIG_INSTALL_DIR "share/cmake/TorchVision" CACHE STRING "install path for TorchVisionConfig.cmake") configure_package_config_file(cmake/TorchVisionConfig.cmake.in diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3eedb6261a4..d3cc923e268 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,22 +4,22 @@ We want to make contributing to this project as easy and transparent as possible ## TL;DR -We appreciate all contributions. If you are interested in contributing to Torchvision, there are many ways to help out. +We appreciate all contributions. If you are interested in contributing to Torchvision, there are many ways to help out. Your contributions may fall into the following categories: -- It helps the project if you could +- It helps the project if you could - Report issues you're facing - - Give a :+1: on issues that others reported and that are relevant to you + - Give a :+1: on issues that others reported and that are relevant to you - Answering queries on the issue tracker, investigating bugs are very valuable contributions to the project. -- You would like to improve the documentation. This is no less important than improving the library itself! +- You would like to improve the documentation. This is no less important than improving the library itself! If you find a typo in the documentation, do not hesitate to submit a GitHub pull request. - If you would like to fix a bug - please pick one from the [list of open issues labelled as "help wanted"](https://github.com/pytorch/vision/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) - comment on the issue that you want to work on this issue - - send a PR with your fix, see below. + - send a PR with your fix, see below. - If you plan to contribute new features, utility functions or extensions, please first open an issue and discuss the feature with us. @@ -30,30 +30,51 @@ clear and has sufficient instructions to be able to reproduce the issue. ## Development installation -### Install PyTorch Nightly + +### Dependencies + +Start by installing the **nightly** build of PyTorch following the [official +instructions](https://pytorch.org/get-started/locally/). Note that the official +instructions may ask you to install torchvision itself. If you are doing development +on torchvision, you should not install prebuilt torchvision packages. + +**Optionally**, install `libpng`, `libjpeg-turbo` and `libwebp` if you want to enable +support for +native encoding / decoding of PNG, JPEG and WebP formats in +[torchvision.io](https://pytorch.org/vision/stable/io.html#image): ```bash -conda install pytorch -c pytorch-nightly -# or with pip (see https://pytorch.org/get-started/locally/) -# pip install numpy -# pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html +conda install libpng libjpeg-turbo libwebp -c pytorch ``` -### Install Torchvision +Note: you can use the `TORCHVISION_INCLUDE` and `TORCHVISION_LIBRARY` +environment variables to tell the build system where to find those libraries if +they are in specific locations. Take a look at +[setup.py](https://github.com/pytorch/vision/blob/main/setup.py) for more +details. 
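A rough sketch of how those environment variables might be used on Linux before building: the `/opt/...` prefixes are placeholders, and the exact path-list parsing is defined in `setup.py` (the snippet assumes entries are separated by the platform path separator, `:` on Linux).

```bash
# Hypothetical layout: libjpeg-turbo and libpng installed under /opt (placeholders).
export TORCHVISION_INCLUDE="/opt/libjpeg-turbo/include:/opt/libpng/include"
export TORCHVISION_LIBRARY="/opt/libjpeg-turbo/lib:/opt/libpng/lib"
# Build torchvision against the libraries found above.
pip install -e . -v --no-build-isolation
```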
+ +### Clone and install torchvision ```bash git clone https://github.com/pytorch/vision.git cd vision -python setup.py develop +pip install -e . -v --no-build-isolation # leave out the -e switch if you don't care about development. # or, for OSX -# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py develop -# for C++ debugging, please use DEBUG=1 -# DEBUG=1 python setup.py develop -pip install flake8 typing mypy pytest pytest-mock scipy +# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ pip install -e . -v --no-build-isolation +# for C++ debugging, use DEBUG=1 +# DEBUG=1 pip install -e . -v --no-build-isolation ``` -You may also have to install `libpng-dev` and `libjpeg-turbo8-dev` libraries: -```bash -conda install libpng jpeg + +By default, GPU support is built if CUDA is found and `torch.cuda.is_available()` is true. It's possible to force +building GPU support by setting `FORCE_CUDA=1` environment variable, which is useful when building a docker image. + +We don't officially support building from source using `pip`, but _if_ you do, you'll need to use the +`--no-build-isolation` flag. + +#### Other development dependencies (some of these are needed to run tests): + +``` +pip install expecttest flake8 typing mypy pytest pytest-mock scipy requests ``` ## Development Process @@ -66,12 +87,12 @@ If you plan to modify the code or documentation, please follow the steps below: 4. Ensure the test suite passes. 5. Make sure your code passes the formatting checks (see below). -For more details about pull requests, -please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). +For more details about pull requests, +please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). -If you would like to contribute a new model, please see [here](#New-model). +If you would like to contribute a new model, please see [here](#New-architecture-or-improved-model-weights). -If you would like to contribute a new dataset, please see [here](#New-dataset). +If you would like to contribute a new dataset, please see [here](#New-dataset). ### Code formatting and typing @@ -83,7 +104,7 @@ Instead of relying directly on `black` however, we rely on [ufmt](https://github.com/omnilib/ufmt), for compatibility reasons with Facebook internal infrastructure. -To format your code, install `ufmt` with `pip install ufmt==1.3.2 black==22.3.0 usort==1.0.2` and use e.g.: +To format your code, install `ufmt` with `pip install ufmt==1.3.3 black==22.3.0 usort==1.0.2` and use e.g.: ```bash ufmt format torchvision @@ -126,8 +147,10 @@ mypy --config-file mypy.ini ### Unit tests -If you have modified the code by adding a new feature or a bug-fix, please add unit tests for that. To run a specific -test: +Before running tests make sure to install [test dependencies](#other-development-dependencies-some-of-these-are-needed-to-run-tests). + +If you have modified the code by adding a new feature or a bug-fix, please add unit tests for that. To run a specific +test: ```bash pytest test/ -vvv -k # e.g. pytest test/test_transforms.py -vvv -k test_center_crop @@ -136,7 +159,7 @@ pytest test/ -vvv -k If you would like to run all tests: ```bash pytest test -vvv -``` +``` Tests that require internet access should be in `test/test_internet.py`. @@ -189,21 +212,23 @@ with "transforms" in their name. 
### New architecture or improved model weights Please refer to the guidelines in [Contributing to Torchvision - Models](https://github.com/pytorch/vision/blob/main/CONTRIBUTING_MODELS.md). - + ### New dataset -More details on how to add a new dataset will be provided later. Please, do not send any PR with a new dataset without discussing +Please, do not send any PR with a new dataset without discussing it in an issue as, most likely, it will not be accepted. ### Pull Request -If all previous checks (flake8, mypy, unit tests) are passing, please send a PR. Submitted PR will pass other tests on -different operation systems, python versions and hardwares. +If all previous checks (flake8, mypy, unit tests) are passing, please send a PR. Submitted PR will pass other tests on +different operating systems, python versions and hardware. -For more details about pull requests workflow, +For more details about pull requests workflow, please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). ## License By contributing to Torchvision, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. + +Contributors are also required to [sign our Contributor License Agreement](https://code.facebook.com/cla). diff --git a/CONTRIBUTING_MODELS.md b/CONTRIBUTING_MODELS.md index 82845e6579a..390a25a0f89 100644 --- a/CONTRIBUTING_MODELS.md +++ b/CONTRIBUTING_MODELS.md @@ -20,13 +20,13 @@ So, before starting any work and submitting a PR there are a few critical things ### 1. Preparation work -- Start by looking into this [issue](https://github.com/pytorch/vision/issues/2707) in order to have an idea of the models that are being considered, express your willingness to add a new model and discuss with the community whether or not this model should be included in TorchVision. It is very important at this stage to make sure that there is an agreement on the value of having this model in TorchVision and there is no one else already working on it. +- Start by looking into this [issue](https://github.com/pytorch/vision/issues/2707) in order to have an idea of the models that are being considered, express your willingness to add a new model and discuss with the community whether this model should be included in TorchVision. It is very important at this stage to make sure that there is an agreement on the value of having this model in TorchVision and there is no one else already working on it. - If the decision is to include the new model, then please create a new ticket which will be used for all design and implementation discussions prior to the PR. One of the TorchVision maintainers will reach out at this stage and this will be your POC from this point onwards in order to provide support, guidance and regular feedback. ### 2. Implement the model -Please take a look at existing models in TorchVision to get familiar with the idioms. Also please look at recent contributions for new models. If in doubt about any design decisions you can ask for feedback on the issue created in step 1. Example of things to take into account: +Please take a look at existing models in TorchVision to get familiar with the idioms. Also, please look at recent contributions for new models. If in doubt about any design decisions you can ask for feedback on the issue created in step 1. 
Example of things to take into account: - The implementation should be as close as possible to the canonical implementation/paper - The PR must include the code implementation, documentation and tests @@ -34,7 +34,7 @@ Please take a look at existing models in TorchVision to get familiar with the id - The weights need to reproduce closely the results of the paper in terms of accuracy, even though the final weights to be deployed will be those trained by the TorchVision maintainers - The PR description should include commands/configuration used to train the model, so that the TorchVision maintainers can easily run them to verify the implementation and generate the final model to be released - Make sure we re-use existing components as much as possible (inheritance) -- New primitives (transforms, losses, etc) can be added if necessary, but the final location will be determined after discussion with the dedicated maintainer +- New primitives (transforms, losses, etc.) can be added if necessary, but the final location will be determined after discussion with the dedicated maintainer - Please take a look at the detailed [implementation and documentation guidelines](https://github.com/pytorch/vision/issues/5319) for a fine grain list of things not to be missed ### 3. Train the model with reference scripts diff --git a/MANIFEST.in b/MANIFEST.in index 75f238c0a2c..9e45188df35 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include README.rst +include README.md include LICENSE recursive-exclude * __pycache__ diff --git a/README.md b/README.md new file mode 100644 index 00000000000..7846c4462e6 --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +# torchvision + +[![total torchvision downloads](https://pepy.tech/badge/torchvision)](https://pepy.tech/project/torchvision) +[![documentation](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchvision%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/vision/stable/index.html) + +The torchvision package consists of popular datasets, model architectures, and common image transformations for computer +vision. + +## Installation + +Please refer to the [official +instructions](https://pytorch.org/get-started/locally/) to install the stable +versions of `torch` and `torchvision` on your system. + +To build source, refer to our [contributing +page](https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation). + +The following is the corresponding `torchvision` versions and supported Python +versions. + +| `torch` | `torchvision` | Python | +| ------------------ | ------------------ | ------------------- | +| `main` / `nightly` | `main` / `nightly` | `>=3.9`, `<=3.12` | +| `2.8` | `0.23` | `>=3.9`, `<=3.13` | +| `2.7` | `0.22` | `>=3.9`, `<=3.13` | +| `2.6` | `0.21` | `>=3.9`, `<=3.12` | + +
+ older versions + +| `torch` | `torchvision` | Python | +|---------|-------------------|---------------------------| +| `2.5` | `0.20` | `>=3.9`, `<=3.12` | +| `2.4` | `0.19` | `>=3.8`, `<=3.12` | +| `2.3` | `0.18` | `>=3.8`, `<=3.12` | +| `2.2` | `0.17` | `>=3.8`, `<=3.11` | +| `2.1` | `0.16` | `>=3.8`, `<=3.11` | +| `2.0` | `0.15` | `>=3.8`, `<=3.11` | +| `1.13` | `0.14` | `>=3.7.2`, `<=3.10` | +| `1.12` | `0.13` | `>=3.7`, `<=3.10` | +| `1.11` | `0.12` | `>=3.7`, `<=3.10` | +| `1.10` | `0.11` | `>=3.6`, `<=3.9` | +| `1.9` | `0.10` | `>=3.6`, `<=3.9` | +| `1.8` | `0.9` | `>=3.6`, `<=3.9` | +| `1.7` | `0.8` | `>=3.6`, `<=3.9` | +| `1.6` | `0.7` | `>=3.6`, `<=3.8` | +| `1.5` | `0.6` | `>=3.5`, `<=3.8` | +| `1.4` | `0.5` | `==2.7`, `>=3.5`, `<=3.8` | +| `1.3` | `0.4.2` / `0.4.3` | `==2.7`, `>=3.5`, `<=3.7` | +| `1.2` | `0.4.1` | `==2.7`, `>=3.5`, `<=3.7` | +| `1.1` | `0.3` | `==2.7`, `>=3.5`, `<=3.7` | +| `<=1.0` | `0.2` | `==2.7`, `>=3.5`, `<=3.7` | + +
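A minimal sketch for checking which `torch` / `torchvision` pair is installed locally, to compare against the table above (adjust the interpreter name if needed):

```bash
# Print the installed torch and torchvision versions.
python -c "import torch, torchvision; print(torch.__version__, torchvision.__version__)"
```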
+ +## Image Backends + +Torchvision currently supports the following image backends: + +- torch tensors +- PIL images: + - [Pillow](https://python-pillow.org/) + - [Pillow-SIMD](https://github.com/uploadcare/pillow-simd) - a **much faster** drop-in replacement for Pillow with SIMD. + +Read more in in our [docs](https://pytorch.org/vision/stable/transforms.html). + +# Using the models on C++ + +Refer to [example/cpp](https://github.com/pytorch/vision/tree/main/examples/cpp). + +**DISCLAIMER**: the `libtorchvision` library includes the torchvision +custom ops as well as most of the C++ torchvision APIs. Those APIs do not come +with any backward-compatibility guarantees and may change from one version to +the next. Only the Python APIs are stable and with backward-compatibility +guarantees. So, if you need stability within a C++ environment, your best bet is +to export the Python APIs via torchscript. + +## Documentation + +You can find the API documentation on the pytorch website: + +## Contributing + +See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out. + +## Disclaimer on Datasets + +This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, +vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to +determine whether you have permission to use the dataset under the dataset's license. + +If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset +to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML +community! + +## Pre-trained Model License + +The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the +dataset used for training. It is your responsibility to determine whether you have permission to use the models for your +use case. + +More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See +[SWAG LICENSE](https://github.com/facebookresearch/SWAG/blob/main/LICENSE) for additional details. + +## Citing TorchVision + +If you find TorchVision useful in your work, please consider citing the following BibTeX entry: + +```bibtex +@software{torchvision2016, + title = {TorchVision: PyTorch's Computer Vision library}, + author = {TorchVision maintainers and contributors}, + year = 2016, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/pytorch/vision}} +} +``` diff --git a/README.rst b/README.rst deleted file mode 100644 index c3605cc3c95..00000000000 --- a/README.rst +++ /dev/null @@ -1,198 +0,0 @@ -torchvision -=========== - -.. image:: https://pepy.tech/badge/torchvision - :target: https://pepy.tech/project/torchvision - -.. image:: https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchvision%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v - :target: https://pytorch.org/vision/stable/index.html - - -The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. - - -Installation -============ - -We recommend Anaconda as Python package management system. Please refer to `pytorch.org `_ -for the detail of PyTorch (``torch``) installation. The following is the corresponding ``torchvision`` versions and -supported Python versions. 
- -+--------------------------+--------------------------+---------------------------------+ -| ``torch`` | ``torchvision`` | ``python`` | -+==========================+==========================+=================================+ -| ``main`` / ``nightly`` | ``main`` / ``nightly`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.12.0`` | ``0.13.0`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.11.0`` | ``0.12.0`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.2`` | ``0.11.3`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.1`` | ``0.11.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.0`` | ``0.11.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.9.1`` | ``0.10.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.9.0`` | ``0.10.0`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.2`` | ``0.9.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.1`` | ``0.9.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.0`` | ``0.9.0`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.1`` | ``0.8.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.0`` | ``0.8.1`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.0`` | ``0.8.0`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.6.0`` | ``0.7.0`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.5.1`` | ``0.6.1`` | ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.5.0`` | ``0.6.0`` | ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.4.0`` | ``0.5.0`` | ``==2.7``, ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.3.1`` | ``0.4.2`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.3.0`` | ``0.4.1`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.2.0`` | ``0.4.0`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.1.0`` | ``0.3.0`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``<=1.0.1`` | ``0.2.2`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | 
-+--------------------------+--------------------------+---------------------------------+ - -Anaconda: - -.. code:: bash - - conda install torchvision -c pytorch - -pip: - -.. code:: bash - - pip install torchvision - -From source: - -.. code:: bash - - python setup.py install - # or, for OSX - # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install - - -We don't officially support building from source using ``pip``, but *if* you do, -you'll need to use the ``--no-build-isolation`` flag. -In case building TorchVision from source fails, install the nightly version of PyTorch following -the linked guide on the `contributing page `_ and retry the install. - -By default, GPU support is built if CUDA is found and ``torch.cuda.is_available()`` is true. -It's possible to force building GPU support by setting ``FORCE_CUDA=1`` environment variable, -which is useful when building a docker image. - -Image Backend -============= -Torchvision currently supports the following image backends: - -* `Pillow`_ (default) - -* `Pillow-SIMD`_ - a **much faster** drop-in replacement for Pillow with SIMD. If installed will be used as the default. - -* `accimage`_ - if installed can be activated by calling :code:`torchvision.set_image_backend('accimage')` - -* `libpng`_ - can be installed via conda :code:`conda install libpng` or any of the package managers for debian-based and RHEL-based Linux distributions. - -* `libjpeg`_ - can be installed via conda :code:`conda install jpeg` or any of the package managers for debian-based and RHEL-based Linux distributions. `libjpeg-turbo`_ can be used as well. - -**Notes:** ``libpng`` and ``libjpeg`` must be available at compilation time in order to be available. Make sure that it is available on the standard library locations, -otherwise, add the include and library paths in the environment variables ``TORCHVISION_INCLUDE`` and ``TORCHVISION_LIBRARY``, respectively. - -.. _libpng : http://www.libpng.org/pub/png/libpng.html -.. _Pillow : https://python-pillow.org/ -.. _Pillow-SIMD : https://github.com/uploadcare/pillow-simd -.. _accimage: https://github.com/pytorch/accimage -.. _libjpeg: http://ijg.org/ -.. _libjpeg-turbo: https://libjpeg-turbo.org/ - -Video Backend -============= -Torchvision currently supports the following video backends: - -* `pyav`_ (default) - Pythonic binding for ffmpeg libraries. - -.. _pyav : https://github.com/PyAV-Org/PyAV - -* video_reader - This needs ffmpeg to be installed and torchvision to be built from source. There shouldn't be any conflicting version of ffmpeg installed. Currently, this is only supported on Linux. - -.. code:: bash - - conda install -c conda-forge ffmpeg - python setup.py install - - -Using the models on C++ -======================= -TorchVision provides an example project for how to use the models on C++ using JIT Script. - -Installation From source: - -.. code:: bash - - mkdir build - cd build - # Add -DWITH_CUDA=on support for the CUDA if needed - cmake .. - make - make install - -Once installed, the library can be accessed in cmake (after properly configuring ``CMAKE_PREFIX_PATH``) via the :code:`TorchVision::TorchVision` target: - -.. code:: rest - - find_package(TorchVision REQUIRED) - target_link_libraries(my-target PUBLIC TorchVision::TorchVision) - -The ``TorchVision`` package will also automatically look for the ``Torch`` package and add it as a dependency to ``my-target``, -so make sure that it is also available to cmake via the ``CMAKE_PREFIX_PATH``. 
- -For an example setup, take a look at ``examples/cpp/hello_world``. - -Python linking is disabled by default when compiling TorchVision with CMake, this allows you to run models without any Python -dependency. In some special cases where TorchVision's operators are used from Python code, you may need to link to Python. This -can be done by passing ``-DUSE_PYTHON=on`` to CMake. - -TorchVision Operators ---------------------- -In order to get the torchvision operators registered with torch (eg. for the JIT), all you need to do is to ensure that you -:code:`#include ` in your project. - -Documentation -============= -You can find the API documentation on the pytorch website: https://pytorch.org/vision/stable/index.html - -Contributing -============ - -See the `CONTRIBUTING `_ file for how to help out. - -Disclaimer on Datasets -====================== - -This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license. - -If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community! - -Pre-trained Model License -========================= - -The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case. - -More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See `SWAG LICENSE `_ for additional details. diff --git a/android/README.md b/android/README.md new file mode 100644 index 00000000000..788c83f26de --- /dev/null +++ b/android/README.md @@ -0,0 +1,3 @@ +## Status + +The Android demo of TorchVision is currently unmaintained, untested and likely out-of-date. 
diff --git a/android/build.gradle b/android/build.gradle index f28ba9112ff..f7995a07f5b 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -14,7 +14,7 @@ allprojects { androidSupportAppCompatV7Version = "28.0.0" fbjniJavaOnlyVersion = "0.0.3" - soLoaderNativeLoaderVersion = "0.10.4" + soLoaderNativeLoaderVersion = "0.10.5" pytorchAndroidVersion = "1.12" } diff --git a/android/gradle.properties b/android/gradle.properties index 1b6b275f63f..8204b73b051 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=0.14.0-SNAPSHOT +VERSION_NAME=0.15.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch SONATYPE_STAGING_PROFILE=orgpytorch diff --git a/android/ops/CMakeLists.txt b/android/ops/CMakeLists.txt index ad42adbfa71..fb8d4348e8e 100644 --- a/android/ops/CMakeLists.txt +++ b/android/ops/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) set(TARGET torchvision_ops) project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) string(APPEND CMAKE_CXX_FLAGS " -DMOBILE") diff --git a/benchmarks/encoding_decoding.py b/benchmarks/encoding_decoding.py new file mode 100644 index 00000000000..0cafdb2d8a6 --- /dev/null +++ b/benchmarks/encoding_decoding.py @@ -0,0 +1,99 @@ +import os +import platform +import statistics + +import torch +import torch.utils.benchmark as benchmark +import torchvision + + +def print_machine_specs(): + print("Processor:", platform.processor()) + print("Platform:", platform.platform()) + print("Logical CPUs:", os.cpu_count()) + print(f"\nCUDA device: {torch.cuda.get_device_name()}") + print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") + + +def get_data(): + transform = torchvision.transforms.Compose( + [ + torchvision.transforms.PILToTensor(), + ] + ) + path = os.path.join(os.getcwd(), "data") + testset = torchvision.datasets.Places365( + root="./data", download=not os.path.exists(path), transform=transform, split="val" + ) + testloader = torch.utils.data.DataLoader( + testset, batch_size=1000, shuffle=False, num_workers=1, collate_fn=lambda batch: [r[0] for r in batch] + ) + return next(iter(testloader)) + + +def run_encoding_benchmark(decoded_images): + results = [] + for device in ["cpu", "cuda"]: + decoded_images_device = [t.to(device=device) for t in decoded_images] + for size in [1, 100, 1000]: + for num_threads in [1, 12, 24]: + for stmt, strat in zip( + [ + "[torchvision.io.encode_jpeg(img) for img in decoded_images_device_trunc]", + "torchvision.io.encode_jpeg(decoded_images_device_trunc)", + ], + ["unfused", "fused"], + ): + decoded_images_device_trunc = decoded_images_device[:size] + t = benchmark.Timer( + stmt=stmt, + setup="import torchvision", + globals={"decoded_images_device_trunc": decoded_images_device_trunc}, + label="Image Encoding", + sub_label=f"{device.upper()} ({strat}): {stmt}", + description=f"{size} images", + num_threads=num_threads, + ) + results.append(t.blocked_autorange()) + compare = benchmark.Compare(results) + compare.print() + + +def run_decoding_benchmark(encoded_images): + results = [] + for device in ["cpu", "cuda"]: + for size in [1, 100, 1000]: + for num_threads in [1, 12, 24]: + for stmt, strat in zip( + [ + f"[torchvision.io.decode_jpeg(img, device='{device}') for img in encoded_images_trunc]", + f"torchvision.io.decode_jpeg(encoded_images_trunc, device='{device}')", + ], + ["unfused", "fused"], + ): + encoded_images_trunc = encoded_images[:size] + t = 
benchmark.Timer( + stmt=stmt, + setup="import torchvision", + globals={"encoded_images_trunc": encoded_images_trunc}, + label="Image Decoding", + sub_label=f"{device.upper()} ({strat}): {stmt}", + description=f"{size} images", + num_threads=num_threads, + ) + results.append(t.blocked_autorange()) + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + print_machine_specs() + decoded_images = get_data() + mean_h, mean_w = statistics.mean(t.shape[-2] for t in decoded_images), statistics.mean( + t.shape[-1] for t in decoded_images + ) + print(f"\nMean image size: {int(mean_h)}x{int(mean_w)}") + run_encoding_benchmark(decoded_images) + encoded_images_cuda = torchvision.io.encode_jpeg([img.cuda() for img in decoded_images]) + encoded_images_cpu = [img.cpu() for img in encoded_images_cuda] + run_decoding_benchmark(encoded_images_cpu) diff --git a/cmake/TorchVisionConfig.cmake.in b/cmake/TorchVisionConfig.cmake.in index 9e92bc3b512..7f7e78817fa 100644 --- a/cmake/TorchVisionConfig.cmake.in +++ b/cmake/TorchVisionConfig.cmake.in @@ -46,13 +46,5 @@ if(@WITH_JPEG@) target_compile_definitions(${PN}::${PN} INTERFACE JPEG_FOUND) endif() -if (@USE_PYTHON@) - if(NOT TARGET Python3::Python) - find_package(Python3 COMPONENTS Development) - endif() - target_link_libraries(torch INTERFACE Python3::Python) - target_compile_definitions(${PN}::${PN} INTERFACE USE_PYTHON) -endif() - endif() endif() diff --git a/cmake/iOS.cmake b/cmake/iOS.cmake index d42ea4c9232..935c57f11b9 100644 --- a/cmake/iOS.cmake +++ b/cmake/iOS.cmake @@ -10,11 +10,11 @@ # SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. # # CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. +# By default this location is automatically chosen based on the IOS_PLATFORM value above. # If set manually, it will override the default location and force the user of a particular Developer Platform # # CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. +# By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. # In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path. 
# If set manually, this will force the use of a specific SDK version @@ -100,7 +100,7 @@ if(IOS_DEPLOYMENT_TARGET) set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") endif() -# Hidden visibilty is required for cxx on iOS +# Hidden visibility is required for cxx on iOS set(CMAKE_C_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS}") set(CMAKE_CXX_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -fvisibility-inlines-hidden") diff --git a/docs/Makefile b/docs/Makefile index 389a07a604e..f462ff22303 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -33,6 +33,7 @@ clean: rm -rf $(SOURCEDIR)/auto_examples/ # sphinx-gallery rm -rf $(SOURCEDIR)/gen_modules/ # sphinx-gallery rm -rf $(SOURCEDIR)/generated/ # autosummary + rm -rf $(SOURCEDIR)/models/generated # autosummary .PHONY: help Makefile docset diff --git a/docs/requirements.txt b/docs/requirements.txt index 1ff0c828042..2a50d9b8f45 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,8 @@ matplotlib numpy sphinx-copybutton>=0.3.1 -sphinx-gallery>=0.9.0 +sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +pycocotools diff --git a/docs/source/_static/css/custom_torchvision.css b/docs/source/_static/css/custom_torchvision.css index bdc4071c1aa..07346d7b03f 100644 --- a/docs/source/_static/css/custom_torchvision.css +++ b/docs/source/_static/css/custom_torchvision.css @@ -21,3 +21,15 @@ article.pytorch-article .reference.download.internal, article.pytorch-article .s .table-weights p { margin-bottom: 0.2rem !important; } + +/* Fix for Sphinx gallery 0.11 +See https://github.com/sphinx-gallery/sphinx-gallery/issues/990 +*/ +article.pytorch-article .sphx-glr-thumbnails .sphx-glr-thumbcontainer { + width: unset; + margin-right: 0; + margin-left: 0; +} +article.pytorch-article div.section div.wy-table-responsive tbody td { + width: 50%; +} diff --git a/docs/source/beta_status.py b/docs/source/beta_status.py index 925894df5c5..8871f6debbb 100644 --- a/docs/source/beta_status.py +++ b/docs/source/beta_status.py @@ -4,11 +4,12 @@ class BetaStatus(Directive): has_content = True + text = "The {api_name} is in Beta stage, and backward compatibility is not guaranteed." + node = nodes.warning def run(self): - api_name = " ".join(self.content) - text = f"The {api_name} is in Beta stage, and backward compatibility is not guaranteed." - return [nodes.warning("", nodes.paragraph("", "", nodes.Text(text)))] + text = self.text.format(api_name=" ".join(self.content)) + return [self.node("", nodes.paragraph("", "", nodes.Text(text)))] def setup(app): diff --git a/docs/source/conf.py b/docs/source/conf.py index 231d3cad416..26771a7b711 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -29,6 +29,7 @@ import pytorch_sphinx_theme import torchvision import torchvision.models as M +from sphinx_gallery.sorting import ExplicitOrder from tabulate import tabulate sys.path.append(os.path.abspath(".")) @@ -55,11 +56,66 @@ "beta_status", ] +# We override sphinx-gallery's example header to prevent sphinx-gallery from +# creating a note at the top of the renderred notebook. +# https://github.com/sphinx-gallery/sphinx-gallery/blob/451ccba1007cc523f39cbcc960ebc21ca39f7b75/sphinx_gallery/gen_rst.py#L1267-L1271 +# This is because we also want to add a link to google Colab, so we write our own note in each example. +from sphinx_gallery import gen_rst + +gen_rst.EXAMPLE_HEADER = """ +.. DO NOT EDIT. +.. 
THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "{0}" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_{1}: + +""" + + +class CustomGalleryExampleSortKey: + # See https://sphinx-gallery.github.io/stable/configuration.html#sorting-gallery-examples + # and https://github.com/sphinx-gallery/sphinx-gallery/blob/master/sphinx_gallery/sorting.py + def __init__(self, src_dir): + self.src_dir = src_dir + + transforms_subsection_order = [ + "plot_transforms_getting_started.py", + "plot_transforms_illustrations.py", + "plot_transforms_e2e.py", + "plot_cutmix_mixup.py", + "plot_rotated_box_transforms.py", + "plot_custom_transforms.py", + "plot_tv_tensors.py", + "plot_custom_tv_tensors.py", + ] + + def __call__(self, filename): + if "gallery/transforms" in self.src_dir: + try: + return self.transforms_subsection_order.index(filename) + except ValueError as e: + raise ValueError( + "Looks like you added an example in gallery/transforms? " + "You need to specify its order in docs/source/conf.py. Look for CustomGalleryExampleSortKey." + ) from e + else: + # For other subsections we just sort alphabetically by filename + return filename + + sphinx_gallery_conf = { "examples_dirs": "../../gallery/", # path to your example scripts "gallery_dirs": "auto_examples", # path to where to save gallery generated output + "subsection_order": ExplicitOrder(["../../gallery/transforms", "../../gallery/others"]), "backreferences_dir": "gen_modules/backreferences", "doc_module": ("torchvision",), + "remove_config_comments": True, + "ignore_pattern": "helpers.py", + "within_subsection_order": CustomGalleryExampleSortKey, } napoleon_use_ivar = True @@ -88,17 +144,15 @@ # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -# -# The short X.Y version. -version = "main (" + torchvision.__version__ + " )" -# The full version, including alpha/beta/rc tags. -release = "main" -VERSION = os.environ.get("VERSION", None) -if VERSION: +# version: The short X.Y version. +# release: The full version, including alpha/beta/rc tags. +if os.environ.get("TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS", None): # Turn 1.11.0aHASH into 1.11 (major.minor only) - version = ".".join(version.split(".")[:2]) + version = release = ".".join(torchvision.__version__.split(".")[:2]) html_title = " ".join((project, version, "documentation")) - release = version +else: + version = f"main ({torchvision.__version__})" + release = "main" # The language for content autogenerated by Sphinx. Refer to documentation @@ -138,7 +192,7 @@ "logo_only": True, "pytorch_project": "docs", "navigation_with_keys": True, - "analytics_id": "UA-117752657-2", + "analytics_id": "GTM-T8XT4PS", } html_logo = "_static/img/pytorch-logo-dark.svg" @@ -318,7 +372,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): used within the autoclass directive. """ - if obj.__name__.endswith(("_Weights", "_QuantizedWeights")): + if getattr(obj, "__name__", "").endswith(("_Weights", "_QuantizedWeights")): if len(obj) == 0: lines[:] = ["There are no available pre-trained weights."] @@ -330,8 +384,8 @@ def inject_weight_metadata(app, what, name, obj, options, lines): f"``weights='DEFAULT'`` or ``weights='{str(list(obj)[0]).split('.')[1]}'``.", ] - if obj.__doc__ != "An enumeration.": - # We only show the custom enum doc if it was overriden. 
The default one from Python is "An enumeration" + if obj.__doc__ is not None and obj.__doc__ != "An enumeration.": + # We only show the custom enum doc if it was overridden. The default one from Python is "An enumeration" lines.append("") lines.append(obj.__doc__) @@ -362,6 +416,13 @@ def inject_weight_metadata(app, what, name, obj, options, lines): max_visible = 3 v_sample = ", ".join(v[:max_visible]) v = f"{v_sample}, ... ({len(v)-max_visible} omitted)" if len(v) > max_visible else v_sample + elif k == "_ops": + v = f"{v:.2f}" + k = "GIPS" if obj.__name__.endswith("_QuantizedWeights") else "GFLOPS" + elif k == "_file_size": + k = "File size" + v = f"{v:.1f} MB" + table.append((str(k), str(v))) table = tabulate(table, tablefmt="rst") lines += [".. rst-class:: table-weights"] # Custom CSS class, see custom_torchvision.css @@ -385,19 +446,27 @@ def generate_weights_table(module, table_name, metrics, dataset, include_pattern if exclude_patterns is not None: weights = [w for w in weights if all(p not in str(w) for p in exclude_patterns)] + ops_name = "GIPS" if "QuantizedWeights" in weights_endswith else "GFLOPS" + metrics_keys, metrics_names = zip(*metrics) - column_names = ["Weight"] + list(metrics_names) + ["Params", "Recipe"] + column_names = ["Weight"] + list(metrics_names) + ["Params"] + [ops_name, "Recipe"] # Final column order column_names = [f"**{name}**" for name in column_names] # Add bold - content = [ - ( + content = [] + for w in weights: + row = [ f":class:`{w} <{type(w).__name__}>`", *(w.meta["_metrics"][dataset][metric] for metric in metrics_keys), f"{w.meta['num_params']/1e6:.1f}M", + f"{w.meta['_ops']:.2f}", f"`link <{w.meta['recipe']}>`__", - ) - for w in weights - ] + ] + + content.append(row) + + column_widths = ["110"] + ["18"] * len(metrics_names) + ["18"] * 2 + ["10"] + widths_table = " ".join(column_widths) + table = tabulate(content, headers=column_names, tablefmt="rst") generated_dir = Path("generated") @@ -405,7 +474,7 @@ def generate_weights_table(module, table_name, metrics, dataset, include_pattern with open(generated_dir / f"{table_name}_table.rst", "w+") as table_file: table_file.write(".. rst-class:: table-weights\n") # Custom CSS class, see custom_torchvision.css table_file.write(".. table::\n") - table_file.write(f" :widths: 100 {'20 ' * len(metrics_names)} 20 10\n\n") + table_file.write(f" :widths: {widths_table} \n\n") table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n") diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 7641139daed..3caa7434e20 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -1,3 +1,5 @@ +.. _datasets: + Datasets ======== @@ -25,6 +27,15 @@ All the datasets have almost similar API. They all have two common arguments: ``transform`` and ``target_transform`` to transform the input and target respectively. You can also create your own datasets using the provided :ref:`base classes `. +.. warning:: + + When a dataset object is created with ``download=True``, the files are first + downloaded and extracted in the root directory. This download logic is not + multi-process safe, so it may lead to conflicts / race conditions if it is + run within a distributed setting. In distributed mode, we recommend creating + a dummy dataset object to trigger the download logic *before* setting up + distributed mode. 
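As a minimal illustration of the recommendation above (``CIFAR10`` is just an arbitrary example dataset here; any dataset with a ``download`` argument behaves the same way):

.. code:: python

    import torchvision

    # Trigger the download once, in the main process, *before* any distributed
    # setup (e.g. before calling torch.distributed.init_process_group or
    # spawning workers).
    torchvision.datasets.CIFAR10(root="data", download=True)

    # Later, inside each distributed process, the files are already on disk,
    # so the dataset can be constructed without downloading again.
    dataset = torchvision.datasets.CIFAR10(root="data", download=False)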
+ Image classification ~~~~~~~~~~~~~~~~~~~~ @@ -52,6 +63,7 @@ Image classification GTSRB INaturalist ImageNet + Imagenette KMNIST LFWPeople LSUN @@ -80,7 +92,6 @@ Image detection or segmentation CocoDetection CelebA Cityscapes - GTSRB Kitti OxfordIIITPet SBDataset @@ -111,11 +122,13 @@ Stereo Matching CarlaStereo Kitti2012Stereo Kitti2015Stereo + CREStereo FallingThingsStereo SceneFlowStereo SintelStereo InStereo2k ETH3DStereo + Middlebury2014Stereo Image pairs ~~~~~~~~~~~ @@ -145,9 +158,16 @@ Video classification HMDB51 Kinetics - Kinetics400 UCF101 +Video prediction +~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + :template: class_dataset.rst + + MovingMNIST .. _base_classes_datasets: @@ -161,3 +181,12 @@ Base classes for custom datasets DatasetFolder ImageFolder VisionDataset + +Transforms v2 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + wrap_dataset_for_transforms_v2 diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047..dc5fdefaefb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,6 +32,7 @@ architectures, and common image transformations for computer vision. :caption: Package Reference transforms + tv_tensors models datasets utils diff --git a/docs/source/io.rst b/docs/source/io.rst index 258a1ee16dc..478321a4e6d 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -1,88 +1,119 @@ -Reading/Writing images and videos -================================= +Decoding / Encoding images and videos +===================================== .. currentmodule:: torchvision.io -The :mod:`torchvision.io` package provides functions for performing IO -operations. They are currently specific to reading and writing video and -images. +The :mod:`torchvision.io` module provides utilities for decoding and encoding +images and videos. -Video ------ +Image Decoding +-------------- -.. autosummary:: - :toctree: generated/ - :template: function.rst +Torchvision currently supports decoding JPEG, PNG, WEBP, GIF, AVIF, and HEIC +images. JPEG decoding can also be done on CUDA GPUs. - read_video - read_video_timestamps - write_video +The main entry point is the :func:`~torchvision.io.decode_image` function, which +you can use as an alternative to ``PIL.Image.open()``. It will decode images +straight into image Tensors, thus saving you the conversion and allowing you to +run transforms/preproc natively on tensors. +.. code:: -Fine-grained video API ----------------------- + from torchvision.io import decode_image -In addition to the :mod:`read_video` function, we provide a high-performance -lower-level API for more fine-grained control compared to the :mod:`read_video` function. -It does all this whilst fully supporting torchscript. + img = decode_image("path_to_image", mode="RGB") + img.dtype # torch.uint8 -.. betastatus:: fine-grained video API + # Or + raw_encoded_bytes = ... # read encoded bytes from your file system + img = decode_image(raw_encoded_bytes, mode="RGB") -.. autosummary:: - :toctree: generated/ - :template: class.rst - - VideoReader +:func:`~torchvision.io.decode_image` will automatically detect the image format, +and call the corresponding decoder (except for HEIC and AVIF images, see details +in :func:`~torchvision.io.decode_avif` and :func:`~torchvision.io.decode_heic`). +You can also use the lower-level format-specific decoders which can be more +powerful, e.g. if you want to encode/decode JPEGs on CUDA. -Example of inspecting a video: +.. 
autosummary:: + :toctree: generated/ + :template: function.rst -.. code:: python + decode_image + decode_jpeg + decode_png + decode_webp + decode_avif + decode_heic + decode_gif - import torchvision - video_path = "path to a test video" - # Constructor allocates memory and a threaded decoder - # instance per video. At the moment it takes two arguments: - # path to the video file, and a wanted stream. - reader = torchvision.io.VideoReader(video_path, "video") +.. autosummary:: + :toctree: generated/ + :template: class.rst - # The information about the video can be retrieved using the - # `get_metadata()` method. It returns a dictionary for every stream, with - # duration and other relevant metadata (often frame rate) - reader_md = reader.get_metadata() + ImageReadMode - # metadata is structured as a dict of dicts with following structure - # {"stream_type": {"attribute": [attribute per stream]}} - # - # following would print out the list of frame rates for every present video stream - print(reader_md["video"]["fps"]) +Obsolete decoding function: - # we explicitly select the stream we would like to operate on. In - # the constructor we select a default video stream, but - # in practice, we can set whichever stream we would like - video.set_current_stream("video:0") +.. autosummary:: + :toctree: generated/ + :template: function.rst + read_image -Image ------ +Image Encoding +-------------- -.. autosummary:: - :toctree: generated/ - :template: class.rst +For encoding, JPEG (CPU and CUDA) and PNG are supported. - ImageReadMode .. autosummary:: :toctree: generated/ :template: function.rst - read_image - decode_image encode_jpeg - decode_jpeg write_jpeg encode_png - decode_png write_png + +IO operations +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + read_file write_file + +Video - DEPRECATED +------------------ + +.. warning:: + + DEPRECATED: All the video decoding and encoding capabilities of torchvision + are deprecated from version 0.22 and will be removed in version 0.24. We + recommend that you migrate to + `TorchCodec `__, where we'll + consolidate the future decoding/encoding capabilities of PyTorch. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + read_video + read_video_timestamps + write_video + + +**Fine-grained video API** + +In addition to the :mod:`read_video` function, we provide a high-performance +lower-level API for more fine-grained control compared to the :mod:`read_video` function. +It does all this whilst fully supporting torchscript. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + VideoReader diff --git a/docs/source/models.rst b/docs/source/models.rst index 57eda6d38a5..d0096aaf854 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -120,13 +120,12 @@ behavior, such as batch normalization. To switch between these modes, use # Set model to eval mode model.eval() -Model Registration Mechanism ---------------------------- - -.. betastatus:: registration mechanism +Listing and retrieving available models +--------------------------------------- -As of v0.14, TorchVision offers a new model registration mechanism which allows retreaving models -and weights by their names. Here are a few examples on how to use them: +As of v0.14, TorchVision offers a new mechanism which allows listing and +retrieving models and weights by their names. Here are a few examples on how to +use them: .. code:: python @@ -148,7 +147,7 @@ and weights by their names. 
Here are a few examples on how to use them: weights_enum2 = get_model_weights(torchvision.models.quantization.mobilenet_v3_large) assert weights_enum == weights_enum2 -Here are the available public methods of the model registration mechanism: +Here are the available public functions to retrieve models and their corresponding weights: .. currentmodule:: torchvision.models .. autosummary:: @@ -173,7 +172,11 @@ Most pre-trained models can be accessed directly via PyTorch Hub without having model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2") # Option 2: passing weights param as enum - weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2") + weights = torch.hub.load( + "pytorch/vision", + "get_weight", + weights="ResNet50_Weights.IMAGENET1K_V2", + ) model = torch.hub.load("pytorch/vision", "resnet50", weights=weights) You can also retrieve all the available weights of a specific model via PyTorch Hub by doing: @@ -207,6 +210,7 @@ weights: models/efficientnetv2 models/googlenet models/inception + models/maxvit models/mnasnet models/mobilenetv2 models/mobilenetv3 @@ -226,10 +230,10 @@ Here is an example of how to use the pre-trained image classification models: .. code:: python - from torchvision.io import read_image + from torchvision.io import decode_image from torchvision.models import resnet50, ResNet50_Weights - img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") + img = decode_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") # Step 1: Initialize model with the best available weights weights = ResNet50_Weights.DEFAULT @@ -283,10 +287,10 @@ Here is an example of how to use the pre-trained quantized image classification .. code:: python - from torchvision.io import read_image + from torchvision.io import decode_image from torchvision.models.quantization import resnet50, ResNet50_QuantizedWeights - img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") + img = decode_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") # Step 1: Initialize model with the best available weights weights = ResNet50_QuantizedWeights.DEFAULT @@ -339,11 +343,11 @@ Here is an example of how to use the pre-trained semantic segmentation models: .. code:: python - from torchvision.io.image import read_image + from torchvision.io.image import decode_image from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights from torchvision.transforms.functional import to_pil_image - img = read_image("gallery/assets/dog1.jpg") + img = decode_image("gallery/assets/dog1.jpg") # Step 1: Initialize model with the best available weights weights = FCN_ResNet50_Weights.DEFAULT @@ -411,12 +415,12 @@ Here is an example of how to use the pre-trained object detection models: .. 
code:: python - from torchvision.io.image import read_image + from torchvision.io.image import decode_image from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights from torchvision.utils import draw_bounding_boxes from torchvision.transforms.functional import to_pil_image - img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") + img = decode_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") # Step 1: Initialize model with the best available weights weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT @@ -517,6 +521,7 @@ pre-trained weights: models/video_mvit models/video_resnet models/video_s3d + models/video_swin_transformer | diff --git a/docs/source/models/alexnet.rst b/docs/source/models/alexnet.rst index 080c241983b..8e94b4eeed9 100644 --- a/docs/source/models/alexnet.rst +++ b/docs/source/models/alexnet.rst @@ -14,7 +14,7 @@ and is based on `One weird trick for parallelizing convolutional neural networks Model builders -------------- -The following model builders can be used to instanciate an AlexNet model, with or +The following model builders can be used to instantiate an AlexNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.alexnet.AlexNet`` base class. Please refer to the `source code diff --git a/docs/source/models/efficientnet.rst b/docs/source/models/efficientnet.rst index 4df547b3cbd..cbc9718959a 100644 --- a/docs/source/models/efficientnet.rst +++ b/docs/source/models/efficientnet.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate an EfficientNet model, with or +The following model builders can be used to instantiate an EfficientNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.efficientnet.EfficientNet`` base class. Please refer to the `source code diff --git a/docs/source/models/efficientnetv2.rst b/docs/source/models/efficientnetv2.rst index 05c953b1327..3066c28ebd4 100644 --- a/docs/source/models/efficientnetv2.rst +++ b/docs/source/models/efficientnetv2.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate an EfficientNetV2 model, with or +The following model builders can be used to instantiate an EfficientNetV2 model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.efficientnet.EfficientNet`` base class. Please refer to the `source code diff --git a/docs/source/models/fcos.rst b/docs/source/models/fcos.rst index 1bcc4267678..085f26549b8 100644 --- a/docs/source/models/fcos.rst +++ b/docs/source/models/fcos.rst @@ -3,7 +3,7 @@ FCOS .. currentmodule:: torchvision.models.detection -The RetinaNet model is based on the `FCOS: Fully Convolutional One-Stage Object Detection +The FCOS model is based on the `FCOS: Fully Convolutional One-Stage Object Detection `__ paper. .. betastatus:: detection module @@ -12,7 +12,7 @@ Model builders -------------- The following model builders can be used to instantiate a FCOS model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.fcos.FCOS`` base class. Please refer to the `source code `_ for more details about this class. 
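As a minimal sketch of the builder pattern described above, assuming the ``fcos_resnet50_fpn`` builder and its ``FCOS_ResNet50_FPN_Weights`` enum (neither is shown in this hunk):

.. code:: python

    from torchvision.models.detection import FCOS_ResNet50_FPN_Weights, fcos_resnet50_fpn

    # With pre-trained weights
    model = fcos_resnet50_fpn(weights=FCOS_ResNet50_FPN_Weights.DEFAULT)
    model.eval()

    # Without pre-trained weights (randomly initialized)
    model = fcos_resnet50_fpn(weights=None)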
diff --git a/docs/source/models/googlenet.rst b/docs/source/models/googlenet.rst index ed4f1345e23..91ea03ddf3d 100644 --- a/docs/source/models/googlenet.rst +++ b/docs/source/models/googlenet.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate a GoogLeNet model, with or +The following model builders can be used to instantiate a GoogLeNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.googlenet.GoogLeNet`` base class. Please refer to the `source code diff --git a/docs/source/models/googlenet_quant.rst b/docs/source/models/googlenet_quant.rst index acb2737b52b..4358389b3e5 100644 --- a/docs/source/models/googlenet_quant.rst +++ b/docs/source/models/googlenet_quant.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate a quantized GoogLeNet +The following model builders can be used to instantiate a quantized GoogLeNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.quantization.googlenet.QuantizableGoogLeNet`` base class. Please refer to the `source code diff --git a/docs/source/models/inception.rst b/docs/source/models/inception.rst index 72aa9724d41..e162eef5d30 100644 --- a/docs/source/models/inception.rst +++ b/docs/source/models/inception.rst @@ -10,7 +10,7 @@ Computer Vision `__ paper. Model builders -------------- -The following model builders can be used to instanciate an InceptionV3 model, with or +The following model builders can be used to instantiate an InceptionV3 model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.inception.Inception3`` base class. Please refer to the `source code `_ for diff --git a/docs/source/models/inception_quant.rst b/docs/source/models/inception_quant.rst index 397fd10df3c..d26f1ab09da 100644 --- a/docs/source/models/inception_quant.rst +++ b/docs/source/models/inception_quant.rst @@ -10,7 +10,7 @@ Computer Vision `__ paper. Model builders -------------- -The following model builders can be used to instanciate a quantized Inception +The following model builders can be used to instantiate a quantized Inception model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.quantization.inception.QuantizableInception3`` base class. Please refer to the `source code diff --git a/docs/source/models/maxvit.rst b/docs/source/models/maxvit.rst new file mode 100644 index 00000000000..29aaaaab334 --- /dev/null +++ b/docs/source/models/maxvit.rst @@ -0,0 +1,23 @@ +MaxVit +=============== + +.. currentmodule:: torchvision.models + +The MaxVit transformer models are based on the `MaxViT: Multi-Axis Vision Transformer `__ +paper. + + +Model builders +-------------- + +The following model builders can be used to instantiate an MaxVit model with and without pre-trained weights. +All the model builders internally rely on the ``torchvision.models.maxvit.MaxVit`` +base class. Please refer to the `source code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + maxvit_t diff --git a/docs/source/models/mnasnet.rst b/docs/source/models/mnasnet.rst index e31b4aca1b6..fd9ea511585 100644 --- a/docs/source/models/mnasnet.rst +++ b/docs/source/models/mnasnet.rst @@ -11,7 +11,7 @@ Search for Mobile `__ paper. 
Model builders -------------- -The following model builders can be used to instanciate an MNASNet model. +The following model builders can be used to instantiate an MNASNet model. All the model builders internally rely on the ``torchvision.models.mnasnet.MNASNet`` base class. Please refer to the `source code diff --git a/docs/source/models/retinanet.rst b/docs/source/models/retinanet.rst index 8613ae9aaab..910692ef3a5 100644 --- a/docs/source/models/retinanet.rst +++ b/docs/source/models/retinanet.rst @@ -12,7 +12,7 @@ Model builders -------------- The following model builders can be used to instantiate a RetinaNet model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.retinanet.RetinaNet`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/ssd.rst b/docs/source/models/ssd.rst index 7d73b234a28..68b0bb224df 100644 --- a/docs/source/models/ssd.rst +++ b/docs/source/models/ssd.rst @@ -12,7 +12,7 @@ The SSD model is based on the `SSD: Single Shot MultiBox Detector Model builders -------------- -The following model builders can be used to instanciate a SSD model, with or +The following model builders can be used to instantiate a SSD model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.SSD`` base class. Please refer to the `source code diff --git a/docs/source/models/ssdlite.rst b/docs/source/models/ssdlite.rst index bac1575c966..7701d1c9f9f 100644 --- a/docs/source/models/ssdlite.rst +++ b/docs/source/models/ssdlite.rst @@ -6,7 +6,7 @@ SSDlite The SSDLite model is based on the `SSD: Single Shot MultiBox Detector `__, `Searching for MobileNetV3 `__ and `MobileNetV2: Inverted Residuals and Linear -Bottlenecks __` papers. +Bottlenecks `__ papers. .. betastatus:: detection module @@ -17,7 +17,7 @@ The following model builders can be used to instantiate a SSD Lite model, with o without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.ssd.SSD`` base class. Please refer to the `source code -`_ for +`_ for more details about this class. .. autosummary:: diff --git a/docs/source/models/swin_transformer.rst b/docs/source/models/swin_transformer.rst index 35b52987954..b302f5bd79d 100644 --- a/docs/source/models/swin_transformer.rst +++ b/docs/source/models/swin_transformer.rst @@ -15,7 +15,7 @@ Model builders -------------- The following model builders can be used to instantiate an SwinTransformer model (original and V2) with and without pre-trained weights. -All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` +All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/vgg.rst b/docs/source/models/vgg.rst index a9fa9aabfb1..77b5686927c 100644 --- a/docs/source/models/vgg.rst +++ b/docs/source/models/vgg.rst @@ -11,7 +11,7 @@ Model builders -------------- The following model builders can be used to instantiate a VGG model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.vgg.VGG`` base class. 
Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/video_swin_transformer.rst b/docs/source/models/video_swin_transformer.rst new file mode 100644 index 00000000000..e31e69759b4 --- /dev/null +++ b/docs/source/models/video_swin_transformer.rst @@ -0,0 +1,27 @@ +Video SwinTransformer +===================== + +.. currentmodule:: torchvision.models.video + +The Video SwinTransformer model is based on the `Video Swin Transformer `__ paper. + +.. betastatus:: video module + + +Model builders +-------------- + +The following model builders can be used to instantiate a Video SwinTransformer model, with or +without pre-trained weights. All the model builders internally rely on the +``torchvision.models.video.swin_transformer.SwinTransformer3d`` base class. Please refer to the `source +code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + swin3d_t + swin3d_s + swin3d_b diff --git a/docs/source/training_references.rst b/docs/source/training_references.rst index fc22ac5eba6..4b5e43109c2 100644 --- a/docs/source/training_references.rst +++ b/docs/source/training_references.rst @@ -19,9 +19,9 @@ guarantees. In general, these scripts rely on the latest (not yet released) pytorch version or the latest torchvision version. This means that to use them, **you might need -to install the latest pytorch and torchvision versions**, with e.g.:: +to install the latest pytorch and torchvision versions** following the `official +instructions `_. - conda install pytorch torchvision -c pytorch-nightly If you need to rely on an older stable version of pytorch or torchvision, e.g. torchvision 0.10, then it's safer to use the scripts from that corresponding diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 5909b68966b..44b4cc3aaa5 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -1,127 +1,572 @@ .. _transforms: -Transforming and augmenting images -================================== +Transforming images, videos, boxes and more +=========================================== .. currentmodule:: torchvision.transforms -Transforms are common image transformations available in the -``torchvision.transforms`` module. They can be chained together using -:class:`Compose`. -Most transform classes have a function equivalent: :ref:`functional -transforms ` give fine-grained control over the -transformations. -This is useful if you have to build a more complex transformation pipeline -(e.g. in the case of segmentation tasks). - -Most transformations accept both `PIL `_ -images and tensor images, although some transformations are :ref:`PIL-only -` and some are :ref:`tensor-only -`. The :ref:`conversion_transforms` may be used to -convert to and from PIL images. - -The transformations that accept tensor images also accept batches of tensor -images. A Tensor Image is a tensor with ``(C, H, W)`` shape, where ``C`` is a -number of channels, ``H`` and ``W`` are image height and width. A batch of -Tensor Images is a tensor of ``(B, C, H, W)`` shape, where ``B`` is a number -of images in the batch. +Torchvision supports common computer vision transformations in the +``torchvision.transforms.v2`` module. Transforms can be used to transform and +augment data, for both training and inference. 
The following objects are +supported: + +- Images as pure tensors, :class:`~torchvision.tv_tensors.Image` or PIL image +- Videos as :class:`~torchvision.tv_tensors.Video` +- Axis-aligned and rotated bounding boxes as :class:`~torchvision.tv_tensors.BoundingBoxes` +- Segmentation and detection masks as :class:`~torchvision.tv_tensors.Mask` +- KeyPoints as :class:`~torchvision.tv_tensors.KeyPoints`. + +.. code:: python + + # Image Classification + import torch + from torchvision.transforms import v2 + + H, W = 32, 32 + img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) + + transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + img = transforms(img) + +.. code:: python + + # Detection (re-using imports and transforms from above) + from torchvision import tv_tensors + + img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) + boxes = torch.randint(0, H // 2, size=(3, 4)) + boxes[:, 2:] += boxes[:, :2] + boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + + # The same transforms can be used! + img, boxes = transforms(img, boxes) + # And you can pass arbitrary input structures + output_dict = transforms({"image": img, "boxes": boxes}) + +Transforms are typically passed as the ``transform`` or ``transforms`` argument +to the :ref:`Datasets `. + +Start here +---------- + +Whether you're new to Torchvision transforms, or you're already experienced with +them, we encourage you to start with +:ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` in +order to learn more about what can be done with the new v2 transforms. + +Then, browse the sections below on this page for general information and +performance tips. The available transforms and functionals are listed in the +:ref:`API reference `. + +More information and tutorials can also be found in our :ref:`example gallery +`, e.g. :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py` +or :ref:`sphx_glr_auto_examples_transforms_plot_custom_transforms.py`. + +.. _conventions: + +Supported input types and conventions +------------------------------------- + +Most transformations accept both `PIL `_ images +and tensor inputs. Both CPU and CUDA tensors are supported. +The result of both backends (PIL or Tensors) should be very +close. In general, we recommend relying on the tensor backend :ref:`for +performance `. The :ref:`conversion transforms +` may be used to convert to and from PIL images, or for +converting dtypes and ranges. + +Tensor images are expected to be of shape ``(C, H, W)``, where ``C`` is the +number of channels, and ``H`` and ``W`` refer to height and width. Most +transforms support batched tensor input. A batch of Tensor images is a tensor of +shape ``(N, C, H, W)``, where ``N`` is the number of images in the batch. The +:ref:`v2 ` transforms generally accept an arbitrary number of leading +dimensions ``(..., C, H, W)`` and can handle batched images or batched videos. + +.. _range_and_dtype: + +Dtype and expected value range +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The expected range of the values of a tensor image is implicitly defined by the tensor dtype. Tensor images with a float dtype are expected to have -values in ``[0, 1)``. 
Tensor images with an integer dtype are expected to +values in ``[0, 1]``. Tensor images with an integer dtype are expected to have values in ``[0, MAX_DTYPE]`` where ``MAX_DTYPE`` is the largest value -that can be represented in that dtype. +that can be represented in that dtype. Typically, images of dtype +``torch.uint8`` are expected to have values in ``[0, 255]``. -Randomized transformations will apply the same transformation to all the -images of a given batch, but they will produce different transformations -across calls. For reproducible transformations across calls, you may use -:ref:`functional transforms `. +Use :class:`~torchvision.transforms.v2.ToDtype` to convert both the dtype and +range of the inputs. -The following examples illustrate the use of the available transforms: +.. _v1_or_v2: - * :ref:`sphx_glr_auto_examples_plot_transforms.py` +V1 or V2? Which one should I use? +--------------------------------- - .. figure:: ../source/auto_examples/images/sphx_glr_plot_transforms_001.png - :align: center - :scale: 65% +**TL;DR** We recommend using the ``torchvision.transforms.v2`` transforms +instead of those in ``torchvision.transforms``. They're faster and they can do +more things. Just change the import and you should be good to go. Moving +forward, new features and improvements will only be considered for the v2 +transforms. + +In Torchvision 0.15 (March 2023), we released a new set of transforms available +in the ``torchvision.transforms.v2`` namespace. These transforms have a lot of +advantages compared to the v1 ones (in ``torchvision.transforms``): + +- They can transform images **and also** bounding boxes, masks, videos and + keypoints. This provides support for tasks beyond image classification: + detection, segmentation, video classification, pose estimation, etc. See + :ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` + and :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py`. +- They support more transforms like :class:`~torchvision.transforms.v2.CutMix` + and :class:`~torchvision.transforms.v2.MixUp`. See + :ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py`. +- They're :ref:`faster `. +- They support arbitrary input structures (dicts, lists, tuples, etc.). +- Future improvements and features will be added to the v2 transforms only. + +These transforms are **fully backward compatible** with the v1 ones, so if +you're already using transforms from ``torchvision.transforms``, all you need to +do is update the import to ``torchvision.transforms.v2``. In terms of +output, there might be negligible differences due to implementation differences. + +.. _transforms_perf: + +Performance considerations +-------------------------- - * :ref:`sphx_glr_auto_examples_plot_scripted_tensor_transforms.py` +We recommend the following guidelines to get the best performance out of the +transforms: - .. figure:: ../source/auto_examples/images/sphx_glr_plot_scripted_tensor_transforms_001.png - :align: center - :scale: 30% +- Rely on the v2 transforms from ``torchvision.transforms.v2`` +- Use tensors instead of PIL images +- Use ``torch.uint8`` dtype, especially for resizing +- Resize with bilinear or bicubic mode -.. warning:: +This is what a typical transform pipeline could look like: - Since v0.8.0 all random transformations are using torch default random generator to sample random parameters. - It is a backward compatibility breaking change and user should set the random state as following: +.. 
code:: python + + from torchvision.transforms import v2 + transforms = v2.Compose([ + v2.ToImage(), # Convert to tensor, only needed if you had a PIL image + v2.ToDtype(torch.uint8, scale=True), # optional, most inputs are already uint8 at this point + # ... + v2.RandomResizedCrop(size=(224, 224), antialias=True), # Or Resize(antialias=True) + # ... + v2.ToDtype(torch.float32, scale=True), # Normalize expects float input + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + +The above should give you the best performance in a typical training environment +that relies on the :class:`torch.utils.data.DataLoader` with ``num_workers > +0``. + +Transforms tend to be sensitive to the input strides / memory format. Some +transforms will be faster with channels-first images while others prefer +channels-last. Like ``torch`` operators, most transforms will preserve the +memory format of the input, but this may not always be respected due to +implementation details. You may want to experiment a bit if you're chasing the +very best performance. Using :func:`torch.compile` on individual transforms may +also help factor out the memory format variable (e.g. on +:class:`~torchvision.transforms.v2.Normalize`). Note that we're talking about +**memory format**, not :ref:`tensor shape `. + +Note that resize transforms like :class:`~torchvision.transforms.v2.Resize` +and :class:`~torchvision.transforms.v2.RandomResizedCrop` typically prefer +channels-last input and tend **not** to benefit from :func:`torch.compile` at +this time. - .. code:: python +.. _functional_transforms: - # Previous versions - # import random - # random.seed(12) +Transform classes, functionals, and kernels +------------------------------------------- - # Now - import torch - torch.manual_seed(17) +Transforms are available as classes like +:class:`~torchvision.transforms.v2.Resize`, but also as functionals like +:func:`~torchvision.transforms.v2.functional.resize` in the +``torchvision.transforms.v2.functional`` namespace. +This is very much like the :mod:`torch.nn` package which defines both classes +and functional equivalents in :mod:`torch.nn.functional`. - Please, keep in mind that the same seed for torch random generator and Python random generator will not - produce the same results. +The functionals support PIL images, pure tensors, or :ref:`TVTensors +`, e.g. both ``resize(image_tensor)`` and ``resize(boxes)`` are +valid. +.. note:: -Scriptable transforms ---------------------- + Random transforms like :class:`~torchvision.transforms.v2.RandomCrop` will + randomly sample some parameter each time they're called. Their functional + counterpart (:func:`~torchvision.transforms.v2.functional.crop`) does not do + any kind of random sampling and thus has a slightly different + parametrization. The ``get_params()`` class method of the transforms class + can be used to perform parameter sampling when using the functional APIs. -In order to script the transformations, please use ``torch.nn.Sequential`` instead of :class:`Compose`. + +The ``torchvision.transforms.v2.functional`` namespace also contains what we +call the "kernels". These are the low-level functions that implement the +core functionalities for specific types, e.g. ``resize_bounding_boxes`` or +``resized_crop_mask``. They are public, although not documented. Check the +`code +`_ +to see which ones are available (note that those starting with a leading +underscore are **not** public!). 
Kernels are only really useful if you want +:ref:`torchscript support ` for types like bounding +boxes or masks. + +.. _transforms_torchscript: + +Torchscript support +------------------- + +Most transform classes and functionals support torchscript. For composing +transforms, use :class:`torch.nn.Sequential` instead of +:class:`~torchvision.transforms.v2.Compose`: .. code:: python transforms = torch.nn.Sequential( - transforms.CenterCrop(10), - transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + CenterCrop(10), + Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ) scripted_transforms = torch.jit.script(transforms) -Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor`` and does not require -`lambda` functions or ``PIL.Image``. +.. warning:: -For any custom transformations to be used with ``torch.jit.script``, they should be derived from ``torch.nn.Module``. + v2 transforms support torchscript, but if you call ``torch.jit.script()`` on + a v2 **class** transform, you'll actually end up with its (scripted) v1 + equivalent. This may lead to slightly different results between the + scripted and eager executions due to implementation differences between v1 + and v2. + If you really need torchscript support for the v2 transforms, we recommend + scripting the **functionals** from the + ``torchvision.transforms.v2.functional`` namespace to avoid surprises. -Compositions of transforms --------------------------- + +Also note that the functionals only support torchscript for pure tensors, which +are always treated as images. If you need torchscript support for other types +like bounding boxes or masks, you can rely on the :ref:`low-level kernels +`. + +For any custom transformations to be used with ``torch.jit.script``, they should +be derived from ``torch.nn.Module``. + +See also: :ref:`sphx_glr_auto_examples_others_plot_scripted_tensor_transforms.py`. + +.. _v2_api_ref: + +V2 API reference - Recommended +------------------------------ + +Geometry +^^^^^^^^ + +Resizing +"""""""" .. autosummary:: :toctree: generated/ :template: class.rst - Compose + v2.Resize + v2.ScaleJitter + v2.RandomShortestSize + v2.RandomResize +Functionals -Transforms on PIL Image and torch.\*Tensor ------------------------------------------- +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.resize + +Cropping +"""""""" .. autosummary:: :toctree: generated/ :template: class.rst + v2.RandomCrop + v2.RandomResizedCrop + v2.RandomIoUCrop + v2.CenterCrop + v2.FiveCrop + v2.TenCrop + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.crop + v2.functional.resized_crop + v2.functional.ten_crop + v2.functional.center_crop + v2.functional.five_crop + +Others +"""""" + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.RandomHorizontalFlip + v2.RandomVerticalFlip + v2.Pad + v2.RandomZoomOut + v2.RandomRotation + v2.RandomAffine + v2.RandomPerspective + v2.ElasticTransform + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.horizontal_flip + v2.functional.vertical_flip + v2.functional.pad + v2.functional.rotate + v2.functional.affine + v2.functional.perspective + v2.functional.elastic + +Color +^^^^^ + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ColorJitter + v2.RandomChannelPermutation + v2.RandomPhotometricDistort + v2.Grayscale + v2.RGB + v2.RandomGrayscale + v2.GaussianBlur + v2.GaussianNoise + v2.RandomInvert + v2.RandomPosterize + v2.RandomSolarize + v2.RandomAdjustSharpness + v2.RandomAutocontrast + v2.RandomEqualize + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.permute_channels + v2.functional.rgb_to_grayscale + v2.functional.grayscale_to_rgb + v2.functional.to_grayscale + v2.functional.gaussian_blur + v2.functional.gaussian_noise + v2.functional.invert + v2.functional.posterize + v2.functional.solarize + v2.functional.adjust_sharpness + v2.functional.autocontrast + v2.functional.adjust_contrast + v2.functional.equalize + v2.functional.adjust_brightness + v2.functional.adjust_saturation + v2.functional.adjust_hue + v2.functional.adjust_gamma + + +Composition +^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.Compose + v2.RandomApply + v2.RandomChoice + v2.RandomOrder + +Miscellaneous +^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.LinearTransformation + v2.Normalize + v2.RandomErasing + v2.Lambda + v2.SanitizeBoundingBoxes + v2.ClampBoundingBoxes + v2.ClampKeyPoints + v2.UniformTemporalSubsample + v2.JPEG + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.normalize + v2.functional.erase + v2.functional.sanitize_bounding_boxes + v2.functional.clamp_bounding_boxes + v2.functional.clamp_keypoints + v2.functional.uniform_temporal_subsample + v2.functional.jpeg + +.. _conversion_transforms: + +Conversion +^^^^^^^^^^ + +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). See :ref:`range_and_dtype`. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ToImage + v2.ToPureTensor + v2.PILToTensor + v2.ToPILImage + v2.ToDtype + v2.ConvertBoundingBoxFormat + +functionals + +.. autosummary:: + :toctree: generated/ + :template: functional.rst + + v2.functional.to_image + v2.functional.pil_to_tensor + v2.functional.to_pil_image + v2.functional.to_dtype + v2.functional.convert_bounding_box_format + + +Deprecated + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ToTensor + v2.functional.to_tensor + v2.ConvertImageDtype + v2.functional.convert_image_dtype + +Auto-Augmentation +^^^^^^^^^^^^^^^^^ + +`AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. +Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that +ImageNet policies provide significant improvements when applied to other datasets. +In TorchVision we implemented 3 policies learned on the following datasets: ImageNet, CIFAR10 and SVHN. +The new transform can be used standalone or mixed-and-matched with existing transforms: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.AutoAugment + v2.RandAugment + v2.TrivialAugmentWide + v2.AugMix + + +CutMix - MixUp +^^^^^^^^^^^^^^ + +CutMix and MixUp are special transforms that +are meant to be used on batches rather than on individual images, because they +are combining pairs of images together. 
These can be used after the dataloader +(once the samples are batched), or part of a collation function. See +:ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py` for detailed usage examples. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.CutMix + v2.MixUp + +Developer tools +^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.Transform + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.register_kernel + v2.query_size + v2.query_chw + v2.get_bounding_boxes + + +V1 API Reference +---------------- + +Geometry +^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Resize + RandomCrop + RandomResizedCrop CenterCrop - ColorJitter FiveCrop - Grayscale + TenCrop Pad + RandomRotation RandomAffine - RandomApply - RandomCrop - RandomGrayscale - RandomHorizontalFlip RandomPerspective - RandomResizedCrop - RandomRotation + ElasticTransform + RandomHorizontalFlip RandomVerticalFlip - Resize - TenCrop + + +Color +^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + ColorJitter + Grayscale + RandomGrayscale GaussianBlur RandomInvert RandomPosterize @@ -130,23 +575,20 @@ Transforms on PIL Image and torch.\*Tensor RandomAutocontrast RandomEqualize - -.. _transforms_pil_only: - -Transforms on PIL Image only ----------------------------- +Composition +^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :template: class.rst + Compose + RandomApply RandomChoice RandomOrder -.. _transforms_tensor_only: - -Transforms on torch.\*Tensor only ---------------------------------- +Miscellaneous +^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ @@ -155,12 +597,16 @@ Transforms on torch.\*Tensor only LinearTransformation Normalize RandomErasing - ConvertImageDtype + Lambda -.. _conversion_transforms: +Conversion +^^^^^^^^^^ -Conversion Transforms ---------------------- +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). See :ref:`range_and_dtype`. .. autosummary:: :toctree: generated/ @@ -169,20 +615,10 @@ Conversion Transforms ToPILImage ToTensor PILToTensor + ConvertImageDtype - -Generic Transforms ------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - Lambda - - -Automatic Augmentation Transforms ---------------------------------- +Auto-Augmentation +^^^^^^^^^^^^^^^^^ `AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that @@ -200,57 +636,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran TrivialAugmentWide AugMix -.. _functional_transforms: + Functional Transforms ---------------------- +^^^^^^^^^^^^^^^^^^^^^ .. currentmodule:: torchvision.transforms.functional -Functional transforms give you fine-grained control of the transformation pipeline. -As opposed to the transformations above, functional transforms don't contain a random number -generator for their parameters. -That means you have to specify/generate all parameters, but the functional transform will give you -reproducible results across calls. 
- -Example: -you can apply a functional transform with the same parameters to multiple images like this: - -.. code:: python - - import torchvision.transforms.functional as TF - import random - - def my_segmentation_transforms(image, segmentation): - if random.random() > 0.5: - angle = random.randint(-30, 30) - image = TF.rotate(image, angle) - segmentation = TF.rotate(segmentation, angle) - # more transforms ... - return image, segmentation - - -Example: -you can use a functional transform to build transform classes with custom behavior: - -.. code:: python - - import torchvision.transforms.functional as TF - import random - - class MyRotationTransform: - """Rotate by one of the given angles.""" - - def __init__(self, angles): - self.angles = angles - - def __call__(self, x): - angle = random.choice(self.angles) - return TF.rotate(x, angle) - - rotation_transform = MyRotationTransform(angles=[-30, -15, 0, 15, 30]) - - .. autosummary:: :toctree: generated/ :template: function.rst diff --git a/docs/source/tv_tensors.rst b/docs/source/tv_tensors.rst new file mode 100644 index 00000000000..d292012fdf8 --- /dev/null +++ b/docs/source/tv_tensors.rst @@ -0,0 +1,30 @@ +.. _tv_tensors: + +TVTensors +========== + +.. currentmodule:: torchvision.tv_tensors + +TVTensors are :class:`torch.Tensor` subclasses which the v2 :ref:`transforms +` use under the hood to dispatch their inputs to the appropriate +lower-level kernels. Most users do not need to manipulate TVTensors directly. + +Refer to +:ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` for +an introduction to TVTensors, or +:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py` for more advanced +info. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Image + Video + KeyPoints + BoundingBoxFormat + BoundingBoxes + Mask + TVTensor + set_return_type + wrap diff --git a/docs/source/utils.rst b/docs/source/utils.rst index 276f730c294..cda04de900a 100644 --- a/docs/source/utils.rst +++ b/docs/source/utils.rst @@ -4,7 +4,7 @@ Utils ===== The ``torchvision.utils`` module contains various utilities, mostly :ref:`for -vizualization `. +visualization `. .. currentmodule:: torchvision.utils diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt new file mode 100644 index 00000000000..a1329b0c968 --- /dev/null +++ b/examples/cpp/CMakeLists.txt @@ -0,0 +1,18 @@ +cmake_minimum_required(VERSION 3.10) +project(run_model) + +option(USE_TORCHVISION "Whether to link to torchvision" OFF) + +find_package(Torch REQUIRED) +if(USE_TORCHVISION) + find_package(TorchVision REQUIRED) +endif() + +add_executable(run_model run_model.cpp) + +target_link_libraries(run_model "${TORCH_LIBRARIES}") +if(USE_TORCHVISION) + target_link_libraries(run_model TorchVision::TorchVision) +endif() + +set_property(TARGET run_model PROPERTY CXX_STANDARD 17) diff --git a/examples/cpp/README.md b/examples/cpp/README.md new file mode 100644 index 00000000000..b2a9174c8ba --- /dev/null +++ b/examples/cpp/README.md @@ -0,0 +1,101 @@ +Using torchvision models in C++ +=============================== + +This is a minimal example of getting TorchVision models to work in C++ with +Torchscript. The model is first scripted in Python and exported to a file, and +then loaded in C++. For a similar tutorial, see [this +tutorial](https://pytorch.org/tutorials/advanced/cpp_export.html). + +In order to successfully compile this example, make sure you have ``LibTorch`` +installed. 
You can either: + +- Install PyTorch normally +- Or download the LibTorch C++ distribution. + +In both cases, refer [here](https://pytorch.org/get-started/locally/) for the +corresponding install or download instructions. + +Some torchvision models only depend on PyTorch operators, and can be used in C++ +without depending on the torchvision lib. Other models rely on torchvision's C++ +operators like NMS, RoiAlign (typically the detection models) and those need to +be linked against the torchvision lib. + +We'll first see the simpler case of running a model without the torchvision lib +dependency. + +Running a model that doesn't need torchvision lib +------------------------------------------------- + +Create a ``build`` directory inside the current one. + +```bash +mkdir build +cd build +``` + +Then run `python ../trace_model.py` which should create a `resnet18.pt` file in +the build directory. This is the scripted model that will be used in the C++ +code. + +We can now start building with CMake. We have to tell CMake where it can find +the necessary PyTorch resources. If you installed PyTorch normally, you can do: + +```bash +TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))") +Torch_DIR="${TORCH_PATH}/share/cmake/Torch" # there should be .cmake files in there + +cmake .. -DTorch_DIR=$Torch_DIR +``` + +If instead you downloaded LibTorch somewhere, you can do: + +```bash +cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch +``` + +Then `cmake --build .` and you should now be able to run + +```bash +./run_model resnet18.pt +``` + +If you try to run a model that depends on the torchvision lib, like +`./run_model fasterrcnn_resnet50_fpn.pt`, you should get a runtime error. This is +because the executable wasn't linked against the torchvision lib. + + +Running a model that needs torchvision lib +------------------------------------------ + +First, we need to build the torchvision lib. To build the torchvision lib go to +the root of the torchvision project and run: + +```bash +mkdir build +cd build +cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch # or -DTorch_DIR= if you installed PyTorch normally, see above +cmake --build . +cmake --install . +``` + +You may want to pass `-DCMAKE_INSTALL_PREFIX=/path/to/libtorchvision` for +cmake to copy/install the files to a specific location (e.g. `$CONDA_PREFIX`). + +**DISCLAIMER**: the `libtorchvision` library includes the torchvision +custom ops as well as most of the C++ torchvision APIs. Those APIs do not come +with any backward-compatibility guarantees and may change from one version to +the next. Only the Python APIs are stable and with backward-compatibility +guarantees. So, if you need stability within a C++ environment, your best bet is +to export the Python APIs via torchscript. + +Now that libtorchvision is built and installed, we can tell our project to use +and link to it via the `-DUSE_TORCHVISION` flag. We also need to tell CMake +where to find it, just like we did with LibTorch, e.g.: + +```bash +cmake .. -DTorch_DIR=$Torch_DIR -DTorchVision_DIR=path/to/libtorchvision -DUSE_TORCHVISION=ON +cmake --build . +``` + +Now the `run_model` executable should be able to run the +`fasterrcnn_resnet50_fpn.pt` file. 
diff --git a/examples/cpp/hello_world/CMakeLists.txt b/examples/cpp/hello_world/CMakeLists.txt deleted file mode 100644 index 3ca59e4c199..00000000000 --- a/examples/cpp/hello_world/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -cmake_minimum_required(VERSION 3.10) -project(hello-world) - -# The first thing do is to tell cmake to find the TorchVision library. -# The package pulls in all the necessary torch libraries, -# so there is no need to also add `find_package(Torch)` here. -find_package(TorchVision REQUIRED) - -# This due to LibTorch's version is the one included in the Python -# package that links to Python. -find_package(Python3 COMPONENTS Development) - -add_executable(hello-world main.cpp) - -# We now need to link the TorchVision library to our executable. -# We can do that by using the TorchVision::TorchVision target, -# which also adds all the necessary torch dependencies. -target_compile_features(hello-world PUBLIC cxx_range_for) -target_link_libraries(hello-world TorchVision::TorchVision) -set_property(TARGET hello-world PROPERTY CXX_STANDARD 14) diff --git a/examples/cpp/hello_world/README.rst b/examples/cpp/hello_world/README.rst deleted file mode 100644 index 81161112034..00000000000 --- a/examples/cpp/hello_world/README.rst +++ /dev/null @@ -1,19 +0,0 @@ -Hello World! -============ - -This is a minimal example of getting TorchVision to work in C++ with CMake. - - -In order to successfully compile this example, make sure you have both ``LibTorch`` and -``TorchVision`` installed. -Once both dependencies are sorted, we can start the CMake fun: - -1) Create a ``build`` directory inside the current one. -2) from within the ``build`` directory, run the following commands: - - | ``cmake -DCMAKE_PREFIX_PATH=";" ..`` - | where ```` and ```` are the paths to the libtorch and torchvision installations. - - ``cmake --build .`` - -| That's it! -| You should now have a ``hello-world`` executable in your ``build`` folder. - Running it will output a (fairly long) tensor of random values to your terminal. diff --git a/examples/cpp/hello_world/main.cpp b/examples/cpp/hello_world/main.cpp deleted file mode 100644 index bcbe68dd07d..00000000000 --- a/examples/cpp/hello_world/main.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include -#include -#include - -int main() { - torch::DeviceType device_type; - device_type = torch::kCPU; - - torch::jit::script::Module model; - try { - std::cout << "Loading model\n"; - // Deserialize the ScriptModule from a file using torch::jit::load(). - model = torch::jit::load("resnet18.pt"); - std::cout << "Model loaded\n"; - } catch (const torch::Error& e) { - std::cout << "error loading the model\n"; - return -1; - } catch (const std::exception& e) { - std::cout << "Other error: " << e.what() << "\n"; - return -1; - } - - // TorchScript models require a List[IValue] as input - std::vector inputs; - - // Create a random input tensor and run it through the model. 
- inputs.push_back(torch::rand({1, 3, 10, 10})); - auto out = model.forward(inputs); - std::cout << out << "\n"; - - if (torch::cuda::is_available()) { - // Move model and inputs to GPU - model.to(torch::kCUDA); - - // Add GPU inputs - inputs.clear(); - torch::TensorOptions options = torch::TensorOptions{torch::kCUDA}; - inputs.push_back(torch::rand({1, 3, 10, 10}, options)); - - auto gpu_out = model.forward(inputs); - std::cout << gpu_out << "\n"; - } -} diff --git a/examples/cpp/hello_world/trace_model.py b/examples/cpp/hello_world/trace_model.py deleted file mode 100644 index 41bbaf8b6dd..00000000000 --- a/examples/cpp/hello_world/trace_model.py +++ /dev/null @@ -1,13 +0,0 @@ -import os.path as osp - -import torch -import torchvision - -HERE = osp.dirname(osp.abspath(__file__)) -ASSETS = osp.dirname(osp.dirname(HERE)) - -model = torchvision.models.resnet18() -model.eval() - -traced_model = torch.jit.script(model) -traced_model.save("resnet18.pt") diff --git a/examples/cpp/run_model.cpp b/examples/cpp/run_model.cpp new file mode 100644 index 00000000000..11faead5dac --- /dev/null +++ b/examples/cpp/run_model.cpp @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#endif // _WIN32 + +int main(int argc, const char* argv[]) { + if (argc != 2) { + std::cout << "Usage: run_model \n"; + return -1; + } + torch::DeviceType device_type; + device_type = torch::kCPU; + + torch::jit::script::Module model; + try { + std::cout << "Loading model\n"; + // Deserialize the ScriptModule from a file using torch::jit::load(). + model = torch::jit::load(argv[1]); + std::cout << "Model loaded\n"; + } catch (const torch::Error&) { + std::cout << "error loading the model.\n"; + return -1; + } catch (const std::exception& e) { + std::cout << "Other error: " << e.what() << "\n"; + return -1; + } + + // TorchScript models require a List[IValue] as input + std::vector inputs; + + if (std::strstr(argv[1], "fasterrcnn") != NULL) { + // Faster RCNN accepts a List[Tensor] as main input + std::vector images; + images.push_back(torch::rand({3, 256, 275})); + images.push_back(torch::rand({3, 256, 275})); + inputs.push_back(images); + } else { + inputs.push_back(torch::rand({1, 3, 10, 10})); + } + auto out = model.forward(inputs); + std::cout << out << "\n"; + + if (torch::cuda::is_available()) { + // Move model and inputs to GPU + model.to(torch::kCUDA); + + // Add GPU inputs + inputs.clear(); + torch::TensorOptions options = torch::TensorOptions{torch::kCUDA}; + if (std::strstr(argv[1], "fasterrcnn") != NULL) { + // Faster RCNN accepts a List[Tensor] as main input + std::vector images; + images.push_back(torch::rand({3, 256, 275}, options)); + images.push_back(torch::rand({3, 256, 275}, options)); + inputs.push_back(images); + } else { + inputs.push_back(torch::rand({1, 3, 10, 10}, options)); + } + + auto gpu_out = model.forward(inputs); + std::cout << gpu_out << "\n"; + } +} diff --git a/examples/cpp/script_model.py b/examples/cpp/script_model.py new file mode 100644 index 00000000000..e91e888e7be --- /dev/null +++ b/examples/cpp/script_model.py @@ -0,0 +1,10 @@ +import torch +from torchvision import models + +for model, name in ( + (models.resnet18(weights=None), "resnet18"), + (models.detection.fasterrcnn_resnet50_fpn(weights=None, weights_backbone=None), "fasterrcnn_resnet50_fpn"), +): + model.eval() + traced_model = torch.jit.script(model) + traced_model.save(f"{name}.pt") diff --git a/gallery/README.rst b/gallery/README.rst index 868afe74351..8dfea355276 100644 --- 
a/gallery/README.rst +++ b/gallery/README.rst @@ -1,4 +1,4 @@ -Example gallery -=============== +.. _gallery: -Below is a gallery of examples +Examples and tutorials +====================== diff --git a/gallery/assets/coco/images/000000000001.jpg b/gallery/assets/coco/images/000000000001.jpg new file mode 120000 index 00000000000..9be80c7c273 --- /dev/null +++ b/gallery/assets/coco/images/000000000001.jpg @@ -0,0 +1 @@ +../../astronaut.jpg \ No newline at end of file diff --git a/gallery/assets/coco/images/000000000002.jpg b/gallery/assets/coco/images/000000000002.jpg new file mode 120000 index 00000000000..9f8efef9928 --- /dev/null +++ b/gallery/assets/coco/images/000000000002.jpg @@ -0,0 +1 @@ +../../dog2.jpg \ No newline at end of file diff --git a/gallery/assets/coco/instances.json b/gallery/assets/coco/instances.json new file mode 100644 index 00000000000..fe0e09270bf --- /dev/null +++ b/gallery/assets/coco/instances.json @@ -0,0 +1 @@ +{"images": [{"file_name": "000000000001.jpg", "height": 512, "width": 512, "id": 1}, {"file_name": "000000000002.jpg", "height": 500, "width": 500, "id": 2}], "annotations": [{"segmentation": [[40.0, 511.0, 26.0, 487.0, 28.0, 438.0, 17.0, 397.0, 24.0, 346.0, 38.0, 306.0, 61.0, 250.0, 111.0, 206.0, 111.0, 187.0, 120.0, 183.0, 136.0, 159.0, 159.0, 150.0, 181.0, 148.0, 182.0, 132.0, 175.0, 132.0, 168.0, 120.0, 154.0, 102.0, 153.0, 62.0, 188.0, 35.0, 191.0, 29.0, 208.0, 20.0, 210.0, 22.0, 227.0, 16.0, 240.0, 16.0, 276.0, 31.0, 285.0, 39.0, 301.0, 88.0, 297.0, 108.0, 281.0, 128.0, 273.0, 138.0, 266.0, 138.0, 264.0, 153.0, 257.0, 162.0, 256.0, 174.0, 284.0, 197.0, 300.0, 221.0, 303.0, 236.0, 337.0, 258.0, 357.0, 306.0, 361.0, 351.0, 358.0, 511.0]], "iscrowd": 0, "image_id": 1, "bbox": [17.0, 16.0, 344.0, 495.0], "category_id": 1, "id": 1}, {"segmentation": [[0.0, 411.0, 43.0, 401.0, 99.0, 395.0, 105.0, 351.0, 124.0, 326.0, 181.0, 294.0, 227.0, 280.0, 245.0, 262.0, 259.0, 234.0, 262.0, 207.0, 271.0, 140.0, 283.0, 139.0, 301.0, 162.0, 309.0, 181.0, 341.0, 175.0, 362.0, 139.0, 369.0, 139.0, 377.0, 163.0, 378.0, 203.0, 381.0, 212.0, 380.0, 220.0, 382.0, 242.0, 404.0, 264.0, 392.0, 293.0, 384.0, 295.0, 385.0, 316.0, 399.0, 343.0, 391.0, 448.0, 452.0, 475.0, 457.0, 494.0, 436.0, 498.0, 402.0, 491.0, 369.0, 488.0, 366.0, 496.0, 319.0, 496.0, 302.0, 485.0, 226.0, 469.0, 128.0, 456.0, 74.0, 458.0, 29.0, 439.0, 0.0, 445.0]], "iscrowd": 0, "image_id": 2, "bbox": [0.0, 139.0, 457.0, 359.0], "category_id": 18, "id": 2}]} diff --git a/gallery/assets/leaning_tower.jpg b/gallery/assets/leaning_tower.jpg new file mode 100644 index 00000000000..fc6e0779f7c Binary files /dev/null and b/gallery/assets/leaning_tower.jpg differ diff --git a/gallery/others/README.rst b/gallery/others/README.rst new file mode 100644 index 00000000000..fafb007d985 --- /dev/null +++ b/gallery/others/README.rst @@ -0,0 +1,2 @@ +Others +------ diff --git a/gallery/plot_optical_flow.py b/gallery/others/plot_optical_flow.py similarity index 90% rename from gallery/plot_optical_flow.py rename to gallery/others/plot_optical_flow.py index b0a93209877..6296c8e667e 100644 --- a/gallery/plot_optical_flow.py +++ b/gallery/others/plot_optical_flow.py @@ -3,6 +3,10 @@ Optical Flow: Predicting movement with the RAFT model ===================================================== +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + Optical flow is the task of predicting movement between two images, usually two consecutive frames of a video. 
Optical flow models take two images as input, and predict a flow: the flow indicates the displacement of every single pixel in the @@ -42,7 +46,7 @@ def plot(imgs, **imshow_kwargs): plt.tight_layout() -################################### +# %% # Reading Videos Using Torchvision # -------------------------------- # We will first read a video using :func:`~torchvision.io.read_video`. @@ -62,7 +66,7 @@ def plot(imgs, **imshow_kwargs): video_path = Path(tempfile.mkdtemp()) / "basketball.mp4" _ = urlretrieve(video_url, video_path) -######################### +# %% # :func:`~torchvision.io.read_video` returns the video frames, audio frames and # the metadata associated with the video. In our case, we only need the video # frames. @@ -79,11 +83,12 @@ def plot(imgs, **imshow_kwargs): plot(img1_batch) -######################### +# %% # The RAFT model accepts RGB images. We first get the frames from -# :func:`~torchvision.io.read_video` and resize them to ensure their -# dimensions are divisible by 8. Then we use the transforms bundled into the -# weights in order to preprocess the input and rescale its values to the +# :func:`~torchvision.io.read_video` and resize them to ensure their dimensions +# are divisible by 8. Note that we explicitly use ``antialias=False``, because +# this is how those models were trained. Then we use the transforms bundled into +# the weights in order to preprocess the input and rescale its values to the # required ``[-1, 1]`` interval. from torchvision.models.optical_flow import Raft_Large_Weights @@ -93,8 +98,8 @@ def plot(imgs, **imshow_kwargs): def preprocess(img1_batch, img2_batch): - img1_batch = F.resize(img1_batch, size=[520, 960]) - img2_batch = F.resize(img2_batch, size=[520, 960]) + img1_batch = F.resize(img1_batch, size=[520, 960], antialias=False) + img2_batch = F.resize(img2_batch, size=[520, 960], antialias=False) return transforms(img1_batch, img2_batch) @@ -103,7 +108,7 @@ def preprocess(img1_batch, img2_batch): print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}") -#################################### +# %% # Estimating Optical flow using RAFT # ---------------------------------- # We will use our RAFT implementation from @@ -124,12 +129,12 @@ def preprocess(img1_batch, img2_batch): print(f"type = {type(list_of_flows)}") print(f"length = {len(list_of_flows)} = number of iterations of the model") -#################################### +# %% # The RAFT model outputs lists of predicted flows where each entry is a # (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration" # in the model. For more details on the iterative nature of the model, please # refer to the `original paper `_. Here, we -# are only interested in the final predicted flows (they are the most acccurate +# are only interested in the final predicted flows (they are the most accurate # ones), so we will just retrieve the last item in the list. # # As described above, a flow is a tensor with dimensions (2, H, W) (or (N, 2, H, @@ -143,10 +148,10 @@ def preprocess(img1_batch, img2_batch): print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}") -#################################### +# %% # Visualizing predicted flows # --------------------------- -# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utlity to +# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to # convert a flow into an RGB image. It also supports batches of flows. # each "direction" in the flow will be mapped to a given RGB color. 
In the # images below, pixels with similar colors are assumed by the model to be moving @@ -165,7 +170,7 @@ def preprocess(img1_batch, img2_batch): grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)] plot(grid) -#################################### +# %% # Bonus: Creating GIFs of predicted flows # --------------------------------------- # In the example above we have only shown the predicted flows of 2 pairs of @@ -186,7 +191,7 @@ def preprocess(img1_batch, img2_batch): # output_folder = "/tmp/" # Update this to the folder of your choice # write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg") -#################################### +# %% # Once the .jpg flow images are saved, you can convert them into a video or a # GIF using ffmpeg with e.g.: # diff --git a/gallery/plot_repurposing_annotations.py b/gallery/others/plot_repurposing_annotations.py similarity index 91% rename from gallery/plot_repurposing_annotations.py rename to gallery/others/plot_repurposing_annotations.py index 7bb68617a17..2c2e10ffb2a 100644 --- a/gallery/plot_repurposing_annotations.py +++ b/gallery/others/plot_repurposing_annotations.py @@ -3,6 +3,10 @@ Repurposing masks into bounding boxes ===================================== +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + The following example illustrates the operations available the :ref:`torchvision.ops ` module for repurposing segmentation masks into object localization annotations for different tasks @@ -20,7 +24,7 @@ import torchvision.transforms.functional as F -ASSETS_DIRECTORY = "assets" +ASSETS_DIRECTORY = "../assets" plt.rcParams["savefig.bbox"] = "tight" @@ -36,7 +40,7 @@ def show(imgs): axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) -#################################### +# %% # Masks # ----- # In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package, @@ -53,7 +57,7 @@ def show(imgs): # A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object # localization tasks. -#################################### +# %% # Converting Masks to Bounding Boxes # ----------------------------------------------- # For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to @@ -62,15 +66,15 @@ def show(imgs): # We will take images and masks from the `PenFudan Dataset `_. -from torchvision.io import read_image +from torchvision.io import decode_image img_path = os.path.join(ASSETS_DIRECTORY, "FudanPed00054.png") mask_path = os.path.join(ASSETS_DIRECTORY, "FudanPed00054_mask.png") -img = read_image(img_path) -mask = read_image(mask_path) +img = decode_image(img_path) +mask = decode_image(mask_path) -######################### +# %% # Here the masks are represented as a PNG Image, with floating point values. # Each pixel is encoded as different colors, with 0 being background. # Notice that the spatial dimensions of image and mask match. @@ -79,7 +83,7 @@ def show(imgs): print(img.size()) print(mask) -############################ +# %% # We get the unique colors, as these would be the object ids. obj_ids = torch.unique(mask) @@ -91,7 +95,7 @@ def show(imgs): # Note that this snippet would work as well if the masks were float values instead of ints. masks = mask == obj_ids[:, None, None] -######################## +# %% # Now the masks are a boolean tensor. 
# The first dimension in this case 3 and denotes the number of instances: there are 3 people in the image. # The other two dimensions are height and width, which are equal to the dimensions of the image. @@ -101,7 +105,7 @@ def show(imgs): print(masks.size()) print(masks) -#################################### +# %% # Let us visualize an image and plot its corresponding segmentation masks. # We will use the :func:`~torchvision.utils.draw_segmentation_masks` to draw the segmentation masks. @@ -113,7 +117,7 @@ def show(imgs): show(drawn_masks) -#################################### +# %% # To convert the boolean masks into bounding boxes. # We will use the :func:`~torchvision.ops.masks_to_boxes` from the torchvision.ops module # It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format. @@ -124,7 +128,7 @@ def show(imgs): print(boxes.size()) print(boxes) -#################################### +# %% # As the shape denotes, there are 3 boxes and in ``(xmin, ymin, xmax, ymax)`` format. # These can be visualized very easily with :func:`~torchvision.utils.draw_bounding_boxes` utility # provided in :ref:`torchvision.utils `. @@ -134,7 +138,7 @@ def show(imgs): drawn_boxes = draw_bounding_boxes(img, boxes, colors="red") show(drawn_boxes) -################################### +# %% # These boxes can now directly be used by detection models in torchvision. # Here is demo with a Faster R-CNN model loaded from # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` @@ -145,15 +149,15 @@ def show(imgs): model = fasterrcnn_resnet50_fpn(weights=weights, progress=False) print(img.size()) -tranforms = weights.transforms() -img = tranforms(img) +transforms = weights.transforms() +img = transforms(img) target = {} target["boxes"] = boxes target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64) detection_outputs = model(img.unsqueeze(0), [target]) -#################################### +# %% # Converting Segmentation Dataset to Detection Dataset # ---------------------------------------------------- # @@ -177,8 +181,8 @@ def __getitem__(self, idx): img_path = os.path.join(self.root, "PNGImages", self.imgs[idx]) mask_path = os.path.join(self.root, "PedMasks", self.masks[idx]) - img = read_image(img_path) - mask = read_image(mask_path) + img = decode_image(img_path) + mask = decode_image(mask_path) img = F.convert_image_dtype(img, dtype=torch.float) mask = F.convert_image_dtype(mask, dtype=torch.float) diff --git a/gallery/others/plot_scripted_tensor_transforms.py b/gallery/others/plot_scripted_tensor_transforms.py new file mode 100644 index 00000000000..da2213347e3 --- /dev/null +++ b/gallery/others/plot_scripted_tensor_transforms.py @@ -0,0 +1,136 @@ +""" +=================== +Torchscript support +=================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates `torchscript +`_ support of the torchvision +:ref:`transforms ` on Tensor images. +""" + +# %% +from pathlib import Path + +import matplotlib.pyplot as plt + +import torch +import torch.nn as nn + +import torchvision.transforms as v1 +from torchvision.io import decode_image + +plt.rcParams["savefig.bbox"] = 'tight' +torch.manual_seed(1) + +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +import sys +sys.path += ["../transforms"] +from helpers import plot +ASSETS_PATH = Path('../assets') + + +# %% +# Most transforms support torchscript. 
For composing transforms, we use +# :class:`torch.nn.Sequential` instead of +# :class:`~torchvision.transforms.v2.Compose`: + +dog1 = decode_image(str(ASSETS_PATH / 'dog1.jpg')) +dog2 = decode_image(str(ASSETS_PATH / 'dog2.jpg')) + +transforms = torch.nn.Sequential( + v1.RandomCrop(224), + v1.RandomHorizontalFlip(p=0.3), +) + +scripted_transforms = torch.jit.script(transforms) + +plot([dog1, scripted_transforms(dog1), dog2, scripted_transforms(dog2)]) + + +# %% +# .. warning:: +# +# Above we have used transforms from the ``torchvision.transforms`` +# namespace, i.e. the "v1" transforms. The v2 transforms from the +# ``torchvision.transforms.v2`` namespace are the :ref:`recommended +# ` way to use transforms in your code. +# +# The v2 transforms also support torchscript, but if you call +# ``torch.jit.script()`` on a v2 **class** transform, you'll actually end up +# with its (scripted) v1 equivalent. This may lead to slightly different +# results between the scripted and eager executions due to implementation +# differences between v1 and v2. +# +# If you really need torchscript support for the v2 transforms, **we +# recommend scripting the functionals** from the +# ``torchvision.transforms.v2.functional`` namespace to avoid surprises. +# +# Below we now show how to combine image transformations and a model forward +# pass, while using ``torch.jit.script`` to obtain a single scripted module. +# +# Let's define a ``Predictor`` module that transforms the input tensor and then +# applies an ImageNet model on it. + +from torchvision.models import resnet18, ResNet18_Weights + + +class Predictor(nn.Module): + + def __init__(self): + super().__init__() + weights = ResNet18_Weights.DEFAULT + self.resnet18 = resnet18(weights=weights, progress=False).eval() + self.transforms = weights.transforms(antialias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + x = self.transforms(x) + y_pred = self.resnet18(x) + return y_pred.argmax(dim=1) + + +# %% +# Now, let's define scripted and non-scripted instances of ``Predictor`` and +# apply it on multiple tensor images of the same size + +device = "cuda" if torch.cuda.is_available() else "cpu" + +predictor = Predictor().to(device) +scripted_predictor = torch.jit.script(predictor).to(device) + +batch = torch.stack([dog1, dog2]).to(device) + +res = predictor(batch) +res_scripted = scripted_predictor(batch) + +# %% +# We can verify that the prediction of the scripted and non-scripted models are +# the same: + +import json + +with open(Path('../assets') / 'imagenet_class_index.json') as labels_file: + labels = json.load(labels_file) + +for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): + assert pred == pred_scripted + print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}") + +# %% +# Since the model is scripted, it can be easily dumped on disk and re-used + +import tempfile + +with tempfile.NamedTemporaryFile() as f: + scripted_predictor.save(f.name) + + dumped_scripted_predictor = torch.jit.load(f.name) + res_scripted_dumped = dumped_scripted_predictor(batch) +assert (res_scripted_dumped == res_scripted).all() + +# %% diff --git a/gallery/plot_visualization_utils.py b/gallery/others/plot_visualization_utils.py similarity index 82% rename from gallery/plot_visualization_utils.py rename to gallery/others/plot_visualization_utils.py index b04e0b6cffa..72c35b53717 100644 --- a/gallery/plot_visualization_utils.py +++ b/gallery/others/plot_visualization_utils.py @@ -3,6 +3,10 @@ Visualization utilities 
======================= +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + This example illustrates some of the utilities that torchvision offers for visualizing images, bounding boxes, segmentation masks and keypoints. """ @@ -30,7 +34,7 @@ def show(imgs): axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) -#################################### +# %% # Visualizing a grid of images # ---------------------------- # The :func:`~torchvision.utils.make_grid` function can be used to create a @@ -38,17 +42,17 @@ def show(imgs): # image of dtype ``uint8`` as input. from torchvision.utils import make_grid -from torchvision.io import read_image +from torchvision.io import decode_image from pathlib import Path -dog1_int = read_image(str(Path('assets') / 'dog1.jpg')) -dog2_int = read_image(str(Path('assets') / 'dog2.jpg')) +dog1_int = decode_image(str(Path('../assets') / 'dog1.jpg')) +dog2_int = decode_image(str(Path('../assets') / 'dog2.jpg')) dog_list = [dog1_int, dog2_int] grid = make_grid(dog_list) show(grid) -#################################### +# %% # Visualizing bounding boxes # -------------------------- # We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an @@ -64,7 +68,7 @@ def show(imgs): show(result) -##################################### +# %% # Naturally, we can also plot bounding boxes produced by torchvision detection # models. Here is a demo with a Faster R-CNN model loaded from # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` @@ -85,7 +89,7 @@ def show(imgs): outputs = model(images) print(outputs) -##################################### +# %% # Let's plot the boxes detected by our model. We will only plot the boxes with a # score greater than a given threshold. @@ -96,7 +100,7 @@ def show(imgs): ] show(dogs_with_boxes) -##################################### +# %% # Visualizing segmentation masks # ------------------------------ # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to @@ -125,7 +129,7 @@ def show(imgs): output = model(batch)['out'] print(output.shape, output.min().item(), output.max().item()) -##################################### +# %% # As we can see above, the output of the segmentation model is a tensor of shape # ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and # we can normalize them into ``[0, 1]`` by using a softmax. After the softmax, @@ -147,7 +151,7 @@ def show(imgs): show(dog_and_boat_masks) -##################################### +# %% # As expected, the model is confident about the dog class, but not so much for # the boat class. # @@ -162,7 +166,7 @@ def show(imgs): show([m.float() for m in boolean_dog_masks]) -##################################### +# %% # The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you # can read it as the following query: "For which pixels is 'dog' the most likely # class?" @@ -184,11 +188,11 @@ def show(imgs): ] show(dogs_with_masks) -##################################### +# %% # We can plot more than one mask per image! Remember that the model returned as # many masks as there are classes. Let's ask the same query as above, but this # time for *all* classes, not just the dog class: "For each pixel and each class -# C, is class C the most most likely class?" +# C, is class C the most likely class?" 
# # This one is a bit more involved, so we'll first show how to do it with a # single image, and then we'll generalize to the batch @@ -204,7 +208,7 @@ def show(imgs): dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6) show(dog_with_all_masks) -##################################### +# %% # We can see in the image above that only 2 masks were drawn: the mask for the # background and the mask for the dog. This is because the model thinks that # only these 2 classes are the most likely ones across all the pixels. If the @@ -231,7 +235,7 @@ def show(imgs): show(dogs_with_masks) -##################################### +# %% # .. _instance_seg_output: # # Instance segmentation models @@ -265,7 +269,7 @@ def show(imgs): output = model(images) print(output) -##################################### +# %% # Let's break this down. For each image in the batch, the model outputs some # detections (or instances). The number of detections varies for each input # image. Each instance is described by its bounding box, its label, its score @@ -288,7 +292,7 @@ def show(imgs): print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, " f"min = {dog1_masks.min()}, max = {dog1_masks.max()}") -##################################### +# %% # Here the masks correspond to probabilities indicating, for each pixel, how # likely it is to belong to the predicted label of that instance. Those # predicted labels correspond to the 'labels' element in the same output dict. @@ -297,7 +301,7 @@ def show(imgs): print("For the first dog, the following instances were detected:") print([weights.meta["categories"][label] for label in dog1_output['labels']]) -##################################### +# %% # Interestingly, the model detects two persons in the image. Let's go ahead and # plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks` # expects boolean masks, we need to convert those probabilities into boolean @@ -315,14 +319,14 @@ def show(imgs): show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9)) -##################################### +# %% # The model seems to have properly detected the dog, but it also confused trees -# with people. Looking more closely at the scores will help us plotting more +# with people. Looking more closely at the scores will help us plot more # relevant masks: print(dog1_output['scores']) -##################################### +# %% # Clearly the model is more confident about the dog detection than it is about # the people detections. That's good news. When plotting the masks, we can ask # for only those that have a good score. Let's use a score threshold of .75 @@ -341,12 +345,12 @@ def show(imgs): ] show(dogs_with_masks) -##################################### +# %% # The two 'people' masks in the first image where not selected because they have -# a lower score than the score threshold. Similarly in the second image, the +# a lower score than the score threshold. Similarly, in the second image, the # instance with class 15 (which corresponds to 'bench') was not selected. -##################################### +# %% # .. 
_keypoint_output: # # Visualizing keypoints @@ -358,9 +362,9 @@ def show(imgs): # from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights -from torchvision.io import read_image +from torchvision.io import decode_image -person_int = read_image(str(Path("assets") / "person1.jpg")) +person_int = decode_image(str(Path("../assets") / "person1.jpg")) weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT transforms = weights.transforms() @@ -373,7 +377,7 @@ def show(imgs): outputs = model([person_float]) print(outputs) -##################################### +# %% # As we see the output contains a list of dictionaries. # The output list is of length batch_size. # We currently have just a single image so length of list is 1. @@ -388,7 +392,7 @@ def show(imgs): print(kpts) print(scores) -##################################### +# %% # The KeypointRCNN model detects there are two instances in the image. # If you plot the boxes by using :func:`~draw_bounding_boxes` # you would recognize they are the person and the surfboard. @@ -402,7 +406,7 @@ def show(imgs): print(keypoints) -##################################### +# %% # Great, now we have the keypoints corresponding to the person. # Each keypoint is represented by x, y coordinates and the visibility. # We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints. @@ -413,8 +417,8 @@ def show(imgs): res = draw_keypoints(person_int, keypoints, colors="blue", radius=3) show(res) -##################################### -# As we see the keypoints appear as colored circles over the image. +# %% +# As we see, the keypoints appear as colored circles over the image. # The coco keypoints for a person are ordered and represent the following list.\ coco_keypoints = [ @@ -424,7 +428,7 @@ def show(imgs): "left_knee", "right_knee", "left_ankle", "right_ankle", ] -##################################### +# %% # What if we are interested in joining the keypoints? # This is especially useful in creating pose detection or action recognition. # We can join the keypoints easily using the `connectivity` parameter. @@ -450,9 +454,69 @@ def show(imgs): (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16) ] -##################################### +# %% # We pass the above list to the connectivity parameter to connect the keypoints. # res = draw_keypoints(person_int, keypoints, connectivity=connect_skeleton, colors="blue", radius=4, width=3) show(res) + +# %% +# That looks pretty good. +# +# .. _draw_keypoints_with_visibility: +# +# Drawing Keypoints with Visibility +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Let's have a look at the results, another keypoint prediction module produced, and show the connectivity: + +prediction = torch.tensor( + [[[208.0176, 214.2409, 1.0000], + [000.0000, 000.0000, 0.0000], + [197.8246, 210.6392, 1.0000], + [000.0000, 000.0000, 0.0000], + [178.6378, 217.8425, 1.0000], + [221.2086, 253.8591, 1.0000], + [160.6502, 269.4662, 1.0000], + [243.9929, 304.2822, 1.0000], + [138.4654, 328.8935, 1.0000], + [277.5698, 340.8990, 1.0000], + [153.4551, 374.5145, 1.0000], + [000.0000, 000.0000, 0.0000], + [226.0053, 370.3125, 1.0000], + [221.8081, 455.5516, 1.0000], + [273.9723, 448.9486, 1.0000], + [193.6275, 546.1933, 1.0000], + [273.3727, 545.5930, 1.0000]]] +) + +res = draw_keypoints(person_int, prediction, connectivity=connect_skeleton, colors="blue", radius=4, width=3) +show(res) + +# %% +# What happened there? 
+# The model, which predicted the new keypoints,
+# can't detect the three points that are hidden on the upper left body of the skateboarder.
+# More precisely, the model predicted that `(x, y, vis) = (0, 0, 0)` for the left_eye, left_ear, and left_hip.
+# So we definitely don't want to display those keypoints and connections, and you don't have to.
+# Looking at the parameters of :func:`~torchvision.utils.draw_keypoints`,
+# we can see that we can pass a visibility tensor as an additional argument.
+# Given the model's prediction, the visibility is already there as the third keypoint dimension; we just need to extract it.
+# Let's split the ``prediction`` into the keypoint coordinates and their respective visibility,
+# and pass both of them as arguments to :func:`~torchvision.utils.draw_keypoints`.
+
+coordinates, visibility = prediction.split([2, 1], dim=-1)
+visibility = visibility.bool()
+
+res = draw_keypoints(
+    person_int, coordinates, visibility=visibility, connectivity=connect_skeleton, colors="blue", radius=4, width=3
+)
+show(res)
+
+# %%
+# We can see that the undetected keypoints are not drawn and the invisible keypoint connections were skipped.
+# This can reduce the noise on images with multiple detections, or in cases like ours,
+# when the keypoint-prediction model missed some detections.
+# Most torch keypoint-prediction models return the visibility for every prediction, ready for you to use.
+# The :func:`~torchvision.models.detection.keypointrcnn_resnet50_fpn` model,
+# which we used in the first case, does so too.
diff --git a/gallery/plot_scripted_tensor_transforms.py b/gallery/plot_scripted_tensor_transforms.py
deleted file mode 100644
index 995383d4603..00000000000
--- a/gallery/plot_scripted_tensor_transforms.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""
-=========================
-Tensor transforms and JIT
-=========================
-
-This example illustrates various features that are now supported by the
-:ref:`image transformations ` on Tensor images. In particular, we
-show how image transforms can be performed on GPU, and how one can also script
-them using JIT compilation.
-
-Prior to v0.8.0, transforms in torchvision have traditionally been PIL-centric
-and presented multiple limitations due to that. Now, since v0.8.0, transforms
-implementations are Tensor and PIL compatible and we can achieve the following
-new features:
-
-- transform multi-band torch tensor images (with more than 3-4 channels)
-- torchscript transforms together with your model for deployment
-- support for GPU acceleration
-- batched transformation such as for videos
-- read and decode data directly as torch tensor with torchscript support (for PNG and JPEG image formats)
-
-.. note::
-    These features are only possible with **Tensor** images.
-""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np - -import torch -import torchvision.transforms as T -from torchvision.io import read_image - - -plt.rcParams["savefig.bbox"] = 'tight' -torch.manual_seed(1) - - -def show(imgs): - fix, axs = plt.subplots(ncols=len(imgs), squeeze=False) - for i, img in enumerate(imgs): - img = T.ToPILImage()(img.to('cpu')) - axs[0, i].imshow(np.asarray(img)) - axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - - -#################################### -# The :func:`~torchvision.io.read_image` function allows to read an image and -# directly load it as a tensor - -dog1 = read_image(str(Path('assets') / 'dog1.jpg')) -dog2 = read_image(str(Path('assets') / 'dog2.jpg')) -show([dog1, dog2]) - -#################################### -# Transforming images on GPU -# -------------------------- -# Most transforms natively support tensors on top of PIL images (to visualize -# the effect of the transforms, you may refer to see -# :ref:`sphx_glr_auto_examples_plot_transforms.py`). -# Using tensor images, we can run the transforms on GPUs if cuda is available! - -import torch.nn as nn - -transforms = torch.nn.Sequential( - T.RandomCrop(224), - T.RandomHorizontalFlip(p=0.3), -) - -device = 'cuda' if torch.cuda.is_available() else 'cpu' -dog1 = dog1.to(device) -dog2 = dog2.to(device) - -transformed_dog1 = transforms(dog1) -transformed_dog2 = transforms(dog2) -show([transformed_dog1, transformed_dog2]) - -#################################### -# Scriptable transforms for easier deployment via torchscript -# ----------------------------------------------------------- -# We now show how to combine image transformations and a model forward pass, -# while using ``torch.jit.script`` to obtain a single scripted module. -# -# Let's define a ``Predictor`` module that transforms the input tensor and then -# applies an ImageNet model on it. 
- -from torchvision.models import resnet18, ResNet18_Weights - - -class Predictor(nn.Module): - - def __init__(self): - super().__init__() - weights = ResNet18_Weights.DEFAULT - self.resnet18 = resnet18(weights=weights, progress=False).eval() - self.transforms = weights.transforms() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - with torch.no_grad(): - x = self.transforms(x) - y_pred = self.resnet18(x) - return y_pred.argmax(dim=1) - - -#################################### -# Now, let's define scripted and non-scripted instances of ``Predictor`` and -# apply it on multiple tensor images of the same size - -predictor = Predictor().to(device) -scripted_predictor = torch.jit.script(predictor).to(device) - -batch = torch.stack([dog1, dog2]).to(device) - -res = predictor(batch) -res_scripted = scripted_predictor(batch) - -#################################### -# We can verify that the prediction of the scripted and non-scripted models are -# the same: - -import json - -with open(Path('assets') / 'imagenet_class_index.json') as labels_file: - labels = json.load(labels_file) - -for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): - assert pred == pred_scripted - print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}") - -#################################### -# Since the model is scripted, it can be easily dumped on disk and re-used - -import tempfile - -with tempfile.NamedTemporaryFile() as f: - scripted_predictor.save(f.name) - - dumped_scripted_predictor = torch.jit.load(f.name) - res_scripted_dumped = dumped_scripted_predictor(batch) -assert (res_scripted_dumped == res_scripted).all() diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py deleted file mode 100644 index d83a508eabe..00000000000 --- a/gallery/plot_video_api.py +++ /dev/null @@ -1,341 +0,0 @@ -""" -======================= -Video API -======================= - -This example illustrates some of the APIs that torchvision offers for -videos, together with the examples on how to build datasets and more. -""" - -#################################### -# 1. Introduction: building a new video object and examining the properties -# ------------------------------------------------------------------------- -# First we select a video to test the object out. For the sake of argument -# we're using one from kinetics400 dataset. -# To create it, we need to define the path and the stream we want to use. - -###################################### -# Chosen video statistics: -# -# - WUzgd7C1pWA.mp4 -# - source: -# - kinetics-400 -# - video: -# - H-264 -# - MPEG-4 AVC (part 10) (avc1) -# - fps: 29.97 -# - audio: -# - MPEG AAC audio (mp4a) -# - sample rate: 48K Hz -# - -import torch -import torchvision -from torchvision.datasets.utils import download_url - -# Download the sample video -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - ".", - "WUzgd7C1pWA.mp4" -) -video_path = "./WUzgd7C1pWA.mp4" - -###################################### -# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form -# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. -# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. 
-# Firstly, let's get the metadata for our particular video: - -stream = "video" -video = torchvision.io.VideoReader(video_path, stream) -video.get_metadata() - -###################################### -# Here we can see that video has two streams - a video and an audio stream. -# Currently available stream types include ['video', 'audio']. -# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id -# (which are determined by video encoding). -# In this way, if the video container contains multiple streams of the same type, -# users can access the one they want. -# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. - -###################################### -# Let's read all the frames from the video stream. By default, the return value of -# ``next(video_reader)`` is a dict containing the following fields. -# -# The return fields are: -# -# - ``data``: containing a torch.tensor -# - ``pts``: containing a float timestamp of this particular frame - -metadata = video.get_metadata() -video.set_current_stream("audio") - -frames = [] # we are going to save the frames here. -ptss = [] # pts is a presentation timestamp in seconds (float) of each frame -for frame in video: - frames.append(frame['data']) - ptss.append(frame['pts']) - -print("PTS for first five frames ", ptss[:5]) -print("Total number of frames: ", len(frames)) -approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] -print("Approx total number of datapoints we can expect: ", approx_nf) -print("Read data size: ", frames[0].size(0) * len(frames)) - -###################################### -# But what if we only want to read certain time segment of the video? -# That can be done easily using the combination of our ``seek`` function, and the fact that each call -# to next returns the presentation timestamp of the returned frame in seconds. -# -# Given that our implementation relies on python iterators, -# we can leverage itertools to simplify the process and make it more pythonic. -# -# For example, if we wanted to read ten frames from second second: - - -import itertools -video.set_current_stream("video") - -frames = [] # we are going to save the frames here. - -# We seek into a second second of the video and use islice to get 10 frames since -for frame, pts in itertools.islice(video.seek(2), 10): - frames.append(frame) - -print("Total number of frames: ", len(frames)) - -###################################### -# Or if we wanted to read from 2nd to 5th second, -# We seek into a second second of the video, -# then we utilize the itertools takewhile to get the -# correct number of frames: - -video.set_current_stream("video") -frames = [] # we are going to save the frames here. -video = video.seek(2) - -for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): - frames.append(frame['data']) - -print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] -print("We can expect approx: ", approx_nf) -print("Tensor size: ", frames[0].size()) - -#################################### -# 2. Building a sample read_video function -# ---------------------------------------------------------------------------------------- -# We can utilize the methods above to build the read video function that follows -# the same API to the existing ``read_video`` function. 
- - -def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): - if end is None: - end = float("inf") - if end < start: - raise ValueError( - "end time should be larger than start time, got " - f"start time={start} and end time={end}" - ) - - video_frames = torch.empty(0) - video_pts = [] - if read_video: - video_object.set_current_stream("video") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - video_pts.append(frame['pts']) - if len(frames) > 0: - video_frames = torch.stack(frames, 0) - - audio_frames = torch.empty(0) - audio_pts = [] - if read_audio: - video_object.set_current_stream("audio") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - audio_pts.append(frame['pts']) - if len(frames) > 0: - audio_frames = torch.cat(frames, 0) - - return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() - - -# Total number of frames should be 327 for video and 523264 datapoints for audio -vf, af, info, meta = example_read_video(video) -print(vf.size(), af.size()) - -#################################### -# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# ------------------------------------------------------------------------------------------------------- -# Cool, so now we can use the same principle to make the sample dataset. -# We suggest trying out iterable dataset for this purpose. -# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. - -#################################### -# Make sample dataset -import os -os.makedirs("./dataset", exist_ok=True) -os.makedirs("./dataset/1", exist_ok=True) -os.makedirs("./dataset/2", exist_ok=True) - -#################################### -# Download the videos -from torchvision.datasets.utils import download_url -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", "WUzgd7C1pWA.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", - "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", - "./dataset/2", - "SOX5yA1l24A.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g23_c01.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g24_c01.avi" -) - -#################################### -# Housekeeping and utilities -import os -import random - -from torchvision.datasets.folder import make_dataset -from torchvision import transforms as t - - -def _find_classes(dir): - classes = [d.name for d in os.scandir(dir) if d.is_dir()] - classes.sort() - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def get_samples(root, extensions=(".mp4", ".avi")): - _, class_to_idx = _find_classes(root) - return make_dataset(root, class_to_idx, extensions=extensions) - -#################################### -# We are going to define the dataset and some basic arguments. 
-# We assume the structure of the FolderDataset, and add the following parameters: -# -# - ``clip_len``: length of a clip in frames -# - ``frame_transform``: transform for every frame individually -# - ``video_transform``: transform on a video sequence -# -# .. note:: -# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# class allows us to naturally oversample clips or images from each video if needed. - - -class RandomDataset(torch.utils.data.IterableDataset): - def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): - super(RandomDataset).__init__() - - self.samples = get_samples(root) - - # Allow for temporal jittering - if epoch_size is None: - epoch_size = len(self.samples) - self.epoch_size = epoch_size - - self.clip_len = clip_len - self.frame_transform = frame_transform - self.video_transform = video_transform - - def __iter__(self): - for i in range(self.epoch_size): - # Get random sample - path, target = random.choice(self.samples) - # Get video object - vid = torchvision.io.VideoReader(path, "video") - metadata = vid.get_metadata() - video_frames = [] # video frame buffer - - # Seek and return frames - max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) - start = random.uniform(0., max_seek) - for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame['data'])) - current_pts = frame['pts'] - # Stack it into a tensor - video = torch.stack(video_frames, 0) - if self.video_transform: - video = self.video_transform(video) - output = { - 'path': path, - 'video': video, - 'target': target, - 'start': start, - 'end': current_pts} - yield output - -#################################### -# Given a path of videos in a folder structure, i.e: -# -# - dataset -# - class 1 -# - file 0 -# - file 1 -# - ... -# - class 2 -# - file 0 -# - file 1 -# - ... -# - ... -# -# We can generate a dataloader and test the dataset. - - -transforms = [t.Resize((112, 112))] -frame_transform = t.Compose(transforms) - -dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) - -#################################### -from torch.utils.data import DataLoader -loader = DataLoader(dataset, batch_size=12) -data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} -for batch in loader: - for i in range(len(batch['path'])): - data['video'].append(batch['path'][i]) - data['start'].append(batch['start'][i].item()) - data['end'].append(batch['end'][i].item()) - data['tensorsize'].append(batch['video'][i].size()) -print(data) - -#################################### -# 4. Data Visualization -# ---------------------------------- -# Example of visualized video - -import matplotlib.pyplot as plt - -plt.figure(figsize=(12, 12)) -for i in range(16): - plt.subplot(4, 4, i + 1) - plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) - plt.axis("off") - -#################################### -# Cleanup the video and dataset: -import os -import shutil -os.remove("./WUzgd7C1pWA.mp4") -shutil.rmtree("./dataset") diff --git a/gallery/transforms/README.rst b/gallery/transforms/README.rst new file mode 100644 index 00000000000..1b8b1b08155 --- /dev/null +++ b/gallery/transforms/README.rst @@ -0,0 +1,4 @@ +.. 
_transforms_gallery: + +Transforms +---------- diff --git a/gallery/transforms/helpers.py b/gallery/transforms/helpers.py new file mode 100644 index 00000000000..bc8de0d2ad1 --- /dev/null +++ b/gallery/transforms/helpers.py @@ -0,0 +1,56 @@ +import matplotlib.pyplot as plt +import torch +from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks +from torchvision import tv_tensors +from torchvision.transforms import v2 +from torchvision.transforms.v2 import functional as F + + +def plot(imgs, row_title=None, bbox_width=3, **imshow_kwargs): + if not isinstance(imgs[0], list): + # Make a 2d grid even if there's just 1 row + imgs = [imgs] + + num_rows = len(imgs) + num_cols = len(imgs[0]) + _, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False) + for row_idx, row in enumerate(imgs): + for col_idx, img in enumerate(row): + boxes = None + masks = None + if isinstance(img, tuple): + img, target = img + if isinstance(target, dict): + boxes = target.get("boxes") + masks = target.get("masks") + elif isinstance(target, tv_tensors.BoundingBoxes): + boxes = target + + # Conversion necessary because draw_bounding_boxes() only + # work with this specific format. + if tv_tensors.is_rotated_bounding_format(boxes.format): + boxes = v2.ConvertBoundingBoxFormat("xyxyxyxy")(boxes) + else: + raise ValueError(f"Unexpected target type: {type(target)}") + img = F.to_image(img) + if img.dtype.is_floating_point and img.min() < 0: + # Poor man's re-normalization for the colors to be OK-ish. This + # is useful for images coming out of Normalize() + img -= img.min() + img /= img.max() + + img = F.to_dtype(img, torch.uint8, scale=True) + if boxes is not None: + img = draw_bounding_boxes(img, boxes, colors="yellow", width=bbox_width) + if masks is not None: + img = draw_segmentation_masks(img, masks.to(torch.bool), colors=["green"] * masks.shape[0], alpha=.65) + + ax = axs[row_idx, col_idx] + ax.imshow(img.permute(1, 2, 0).numpy(), **imshow_kwargs) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + + if row_title is not None: + for row_idx in range(num_rows): + axs[row_idx, 0].set(ylabel=row_title[row_idx]) + + plt.tight_layout() diff --git a/gallery/transforms/plot_custom_transforms.py b/gallery/transforms/plot_custom_transforms.py new file mode 100644 index 00000000000..d1bd9455bfb --- /dev/null +++ b/gallery/transforms/plot_custom_transforms.py @@ -0,0 +1,200 @@ +""" +=================================== +How to write your own v2 transforms +=================================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This guide explains how to write transforms that are compatible with the +torchvision transforms V2 API. +""" + +# %% +from typing import Any, Dict, List + +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + + +# %% +# Just create a ``nn.Module`` and override the ``forward`` method +# =============================================================== +# +# In most cases, this is all you're going to need, as long as you already know +# the structure of the input that your transform will expect. For example if +# you're just doing image classification, your transform will typically accept a +# single image as input, or a ``(img, label)`` input. So you can just hard-code +# your ``forward`` method to accept just that, e.g. +# +# .. 
code:: python +# +# class MyCustomTransform(torch.nn.Module): +# def forward(self, img, label): +# # Do some transformations +# return new_img, new_label +# +# .. note:: +# +# This means that if you have a custom transform that is already compatible +# with the V1 transforms (those in ``torchvision.transforms``), it will +# still work with the V2 transforms without any change! +# +# We will illustrate this more completely below with a typical detection case, +# where our samples are just images, bounding boxes and labels: + +class MyCustomTransform(torch.nn.Module): + def forward(self, img, bboxes, label): # we assume inputs are always structured like this + print( + f"I'm transforming an image of shape {img.shape} " + f"with bboxes = {bboxes}\n{label = }" + ) + # Do some transformations. Here, we're just passing though the input + return img, bboxes, label + + +transforms = v2.Compose([ + MyCustomTransform(), + v2.RandomResizedCrop((224, 224), antialias=True), + v2.RandomHorizontalFlip(p=1), + v2.Normalize(mean=[0, 0, 0], std=[1, 1, 1]) +]) + +H, W = 256, 256 +img = torch.rand(3, H, W) +bboxes = tv_tensors.BoundingBoxes( + torch.tensor([[0, 10, 10, 20], [50, 50, 70, 70]]), + format="XYXY", + canvas_size=(H, W) +) +label = 3 + +out_img, out_bboxes, out_label = transforms(img, bboxes, label) +# %% +print(f"Output image shape: {out_img.shape}\nout_bboxes = {out_bboxes}\n{out_label = }") +# %% +# .. note:: +# While working with TVTensor classes in your code, make sure to +# familiarize yourself with this section: +# :ref:`tv_tensor_unwrapping_behaviour` +# +# Supporting arbitrary input structures +# ===================================== +# +# In the section above, we have assumed that you already know the structure of +# your inputs and that you're OK with hard-coding this expected structure in +# your code. If you want your custom transforms to be as flexible as possible, +# this can be a bit limiting. +# +# A key feature of the builtin Torchvision V2 transforms is that they can accept +# arbitrary input structure and return the same structure as output (with +# transformed entries). For example, transforms can accept a single image, or a +# tuple of ``(img, label)``, or an arbitrary nested dictionary as input. Here's +# an example on the built-in transform :class:`~torchvision.transforms.v2.RandomHorizontalFlip`: + +structured_input = { + "img": img, + "annotations": (bboxes, label), + "something that will be ignored": (1, "hello"), + "another tensor that is ignored": torch.arange(10), +} +structured_output = v2.RandomHorizontalFlip(p=1)(structured_input) + +assert isinstance(structured_output, dict) +assert structured_output["something that will be ignored"] == (1, "hello") +assert (structured_output["another tensor that is ignored"] == torch.arange(10)).all() +print(f"The input bboxes are:\n{structured_input['annotations'][0]}") +print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}") + +# %% +# Basics: override the `transform()` method +# ----------------------------------------- +# +# In order to support arbitrary inputs in your custom transform, you will need +# to inherit from :class:`~torchvision.transforms.v2.Transform` and override the +# `.transform()` method (not the `forward()` method!). 
Below is a basic example: + + +class MyCustomTransform(v2.Transform): + def transform(self, inpt: Any, params: Dict[str, Any]): + if type(inpt) == torch.Tensor: + print(f"I'm transforming an image of shape {inpt.shape}") + return inpt + 1 # dummy transformation + elif isinstance(inpt, tv_tensors.BoundingBoxes): + print(f"I'm transforming bounding boxes! {inpt.canvas_size = }") + return tv_tensors.wrap(inpt + 100, like=inpt) # dummy transformation + + +my_custom_transform = MyCustomTransform() +structured_output = my_custom_transform(structured_input) + +assert isinstance(structured_output, dict) +assert structured_output["something that will be ignored"] == (1, "hello") +assert (structured_output["another tensor that is ignored"] == torch.arange(10)).all() +print(f"The input bboxes are:\n{structured_input['annotations'][0]}") +print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}") + +# %% +# An important thing to note is that when we call ``my_custom_transform`` on +# ``structured_input``, the input is flattened and then each individual part is +# passed to ``transform()``. That is, ``transform()``` receives the input image, +# then the bounding boxes, etc. Within ``transform()``, you can decide how to +# transform each input, based on their type. +# +# If you're curious why the other tensor (``torch.arange()``) didn't get passed +# to ``transform()``, see :ref:`this note ` for more +# details. +# +# Advanced: The ``make_params()`` method +# -------------------------------------- +# +# The ``make_params()`` method is called internally before calling +# ``transform()`` on each input. This is typically useful to generate random +# parameter values. In the example below, we use it to randomly apply the +# transformation with a probability of 0.5 + + +class MyRandomTransform(MyCustomTransform): + def __init__(self, p=0.5): + self.p = p + super().__init__() + + def make_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + apply_transform = (torch.rand(size=(1,)) < self.p).item() + params = dict(apply_transform=apply_transform) + return params + + def transform(self, inpt: Any, params: Dict[str, Any]): + if not params["apply_transform"]: + print("Not transforming anything!") + return inpt + else: + return super().transform(inpt, params) + + +my_random_transform = MyRandomTransform() + +torch.manual_seed(0) +_ = my_random_transform(structured_input) # transforms +_ = my_random_transform(structured_input) # doesn't transform + +# %% +# +# .. note:: +# +# It's important for such random parameter generation to happen within +# ``make_params()`` and not within ``transform()``, so that for a given +# transform call, the same RNG applies to all the inputs in the same way. If +# we were to perform the RNG within ``transform()``, we would risk e.g. +# transforming the image while *not* transforming the bounding boxes. +# +# The ``make_params()`` method takes the list of all the inputs as parameter +# (each of the elements in this list will later be pased to ``transform()``). +# You can use ``flat_inputs`` to e.g. figure out the dimensions on the input, +# using :func:`~torchvision.transforms.v2.query_chw` or +# :func:`~torchvision.transforms.v2.query_size`. +# +# ``make_params()`` should return a dict (or actually, anything you want) that +# will then be passed to ``transform()``. 
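# %%
# To make this concrete, here is a minimal sketch (not a built-in transform;
# ``MyRandomCrop`` is a hypothetical name) of a fixed-size random crop that
# draws a single crop window in ``make_params()`` with the help of
# :func:`~torchvision.transforms.v2.query_size`, and applies that same window
# to every input in ``transform()``:

from torchvision.transforms.v2 import functional as F


class MyRandomCrop(v2.Transform):
    def __init__(self, size=100):
        super().__init__()
        self.size = size

    def make_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        # query_size() returns the (H, W) shared by the transformable inputs.
        height, width = v2.query_size(flat_inputs)
        top = int(torch.randint(0, height - self.size + 1, size=(1,)))
        left = int(torch.randint(0, width - self.size + 1, size=(1,)))
        return dict(top=top, left=left)

    def transform(self, inpt: Any, params: Dict[str, Any]):
        # The same (top, left) window is applied to the image and to the
        # bounding boxes, because make_params() runs once per call.
        return F.crop(inpt, top=params["top"], left=params["left"],
                      height=self.size, width=self.size)


out = MyRandomCrop()(structured_input)
print(f"{out['annotations'][0].canvas_size = }")  # expected to be (100, 100)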
diff --git a/gallery/transforms/plot_custom_tv_tensors.py b/gallery/transforms/plot_custom_tv_tensors.py new file mode 100644 index 00000000000..9b113901461 --- /dev/null +++ b/gallery/transforms/plot_custom_tv_tensors.py @@ -0,0 +1,119 @@ +""" +==================================== +How to write your own TVTensor class +==================================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This guide is intended for advanced users and downstream library maintainers. We explain how to +write your own TVTensor class, and how to make it compatible with the built-in +Torchvision v2 transforms. Before continuing, make sure you have read +:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`. +""" + +# %% +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + +# %% +# We will create a very simple class that just inherits from the base +# :class:`~torchvision.tv_tensors.TVTensor` class. It will be enough to cover +# what you need to know to implement your more elaborate uses-cases. If you need +# to create a class that carries meta-data, take a look at how the +# :class:`~torchvision.tv_tensors.BoundingBoxes` class is `implemented +# `_. + + +class MyTVTensor(tv_tensors.TVTensor): + pass + + +my_dp = MyTVTensor([1, 2, 3]) +my_dp + +# %% +# Now that we have defined our custom TVTensor class, we want it to be +# compatible with the built-in torchvision transforms, and the functional API. +# For that, we need to implement a kernel which performs the core of the +# transformation, and then "hook" it to the functional that we want to support +# via :func:`~torchvision.transforms.v2.functional.register_kernel`. +# +# We illustrate this process below: we create a kernel for the "horizontal flip" +# operation of our MyTVTensor class, and register it to the functional API. + +from torchvision.transforms.v2 import functional as F + + +@F.register_kernel(functional="hflip", tv_tensor_cls=MyTVTensor) +def hflip_my_tv_tensor(my_dp, *args, **kwargs): + print("Flipping!") + out = my_dp.flip(-1) + return tv_tensors.wrap(out, like=my_dp) + + +# %% +# To understand why :func:`~torchvision.tv_tensors.wrap` is used, see +# :ref:`tv_tensor_unwrapping_behaviour`. Ignore the ``*args, **kwargs`` for now, +# we will explain it below in :ref:`param_forwarding`. +# +# .. note:: +# +# In our call to ``register_kernel`` above we used a string +# ``functional="hflip"`` to refer to the functional we want to hook into. We +# could also have used the functional *itself*, i.e. +# ``@register_kernel(functional=F.hflip, ...)``. +# +# Now that we have registered our kernel, we can call the functional API on a +# ``MyTVTensor`` instance: + +my_dp = MyTVTensor(torch.rand(3, 256, 256)) +_ = F.hflip(my_dp) + +# %% +# And we can also use the +# :class:`~torchvision.transforms.v2.RandomHorizontalFlip` transform, since it relies on :func:`~torchvision.transforms.v2.functional.hflip` internally: +t = v2.RandomHorizontalFlip(p=1) +_ = t(my_dp) + +# %% +# .. note:: +# +# We cannot register a kernel for a transform class, we can only register a +# kernel for a **functional**. The reason we can't register a transform +# class is because one transform may internally rely on more than one +# functional, so in general we can't register a single kernel for a given +# class. +# +# .. 
_param_forwarding: +# +# Parameter forwarding, and ensuring future compatibility of your kernels +# ----------------------------------------------------------------------- +# +# The functional API that you're hooking into is public and therefore +# **backward** compatible: we guarantee that the parameters of these functionals +# won't be removed or renamed without a proper deprecation cycle. However, we +# don't guarantee **forward** compatibility, and we may add new parameters in +# the future. +# +# Imagine that in a future version, Torchvision adds a new ``inplace`` parameter +# to its :func:`~torchvision.transforms.v2.functional.hflip` functional. If you +# already defined and registered your own kernel as + +def hflip_my_tv_tensor(my_dp): # noqa + print("Flipping!") + out = my_dp.flip(-1) + return tv_tensors.wrap(out, like=my_dp) + + +# %% +# then calling ``F.hflip(my_dp)`` will **fail**, because ``hflip`` will try to +# pass the new ``inplace`` parameter to your kernel, but your kernel doesn't +# accept it. +# +# For this reason, we recommend to always define your kernels with +# ``*args, **kwargs`` in their signature, as done above. This way, your kernel +# will be able to accept any new parameter that we may add in the future. +# (Technically, adding `**kwargs` only should be enough). diff --git a/gallery/transforms/plot_cutmix_mixup.py b/gallery/transforms/plot_cutmix_mixup.py new file mode 100644 index 00000000000..222be0ff359 --- /dev/null +++ b/gallery/transforms/plot_cutmix_mixup.py @@ -0,0 +1,150 @@ + +""" +=========================== +How to use CutMix and MixUp +=========================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +:class:`~torchvision.transforms.v2.CutMix` and +:class:`~torchvision.transforms.v2.MixUp` are popular augmentation strategies +that can improve classification accuracy. + +These transforms are slightly different from the rest of the Torchvision +transforms, because they expect +**batches** of samples as input, not individual images. In this example we'll +explain how to use them: after the ``DataLoader``, or as part of a collation +function. +""" + +# %% +import torch +from torchvision.datasets import FakeData +from torchvision.transforms import v2 + + +NUM_CLASSES = 100 + +# %% +# Pre-processing pipeline +# ----------------------- +# +# We'll use a simple but typical image classification pipeline: + +preproc = v2.Compose([ + v2.PILToTensor(), + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), # to float32 in [0, 1] + v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), # typically from ImageNet +]) + +dataset = FakeData(size=1000, num_classes=NUM_CLASSES, transform=preproc) + +img, label = dataset[0] +print(f"{type(img) = }, {img.dtype = }, {img.shape = }, {label = }") + +# %% +# +# One important thing to note is that neither CutMix nor MixUp are part of this +# pre-processing pipeline. We'll add them a bit later once we define the +# DataLoader. 
Just as a refresher, this is what the DataLoader and training loop +# would look like if we weren't using CutMix or MixUp: + +from torch.utils.data import DataLoader + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True) + +for images, labels in dataloader: + print(f"{images.shape = }, {labels.shape = }") + print(labels.dtype) + # + break +# %% + +# %% +# Where to use MixUp and CutMix +# ----------------------------- +# +# After the DataLoader +# ^^^^^^^^^^^^^^^^^^^^ +# +# Now let's add CutMix and MixUp. The simplest way to do this right after the +# DataLoader: the Dataloader has already batched the images and labels for us, +# and this is exactly what these transforms expect as input: + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True) + +cutmix = v2.CutMix(num_classes=NUM_CLASSES) +mixup = v2.MixUp(num_classes=NUM_CLASSES) +cutmix_or_mixup = v2.RandomChoice([cutmix, mixup]) + +for images, labels in dataloader: + print(f"Before CutMix/MixUp: {images.shape = }, {labels.shape = }") + images, labels = cutmix_or_mixup(images, labels) + print(f"After CutMix/MixUp: {images.shape = }, {labels.shape = }") + + # + break +# %% +# +# Note how the labels were also transformed: we went from a batched label of +# shape (batch_size,) to a tensor of shape (batch_size, num_classes). The +# transformed labels can still be passed as-is to a loss function like +# :func:`torch.nn.functional.cross_entropy`. +# +# As part of the collation function +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Passing the transforms after the DataLoader is the simplest way to use CutMix +# and MixUp, but one disadvantage is that it does not take advantage of the +# DataLoader multi-processing. For that, we can pass those transforms as part of +# the collation function (refer to the `PyTorch docs +# `_ to learn +# more about collation). + +from torch.utils.data import default_collate + + +def collate_fn(batch): + return cutmix_or_mixup(*default_collate(batch)) + + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn) + +for images, labels in dataloader: + print(f"{images.shape = }, {labels.shape = }") + # No need to call cutmix_or_mixup, it's already been called as part of the DataLoader! + # + break + +# %% +# Non-standard input format +# ------------------------- +# +# So far we've used a typical sample structure where we pass ``(images, +# labels)`` as inputs. MixUp and CutMix will magically work by default with most +# common sample structures: tuples where the second parameter is a tensor label, +# or dict with a "label[s]" key. Look at the documentation of the +# ``labels_getter`` parameter for more details. +# +# If your samples have a different structure, you can still use CutMix and MixUp +# by passing a callable to the ``labels_getter`` parameter. 
For example: + +batch = { + "imgs": torch.rand(4, 3, 224, 224), + "target": { + "classes": torch.randint(0, NUM_CLASSES, size=(4,)), + "some_other_key": "this is going to be passed-through" + } +} + + +def labels_getter(batch): + return batch["target"]["classes"] + + +out = v2.CutMix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch) +print(f"{out['imgs'].shape = }, {out['target']['classes'].shape = }") diff --git a/gallery/transforms/plot_rotated_box_transforms.py b/gallery/transforms/plot_rotated_box_transforms.py new file mode 100644 index 00000000000..7c6e3a559df --- /dev/null +++ b/gallery/transforms/plot_rotated_box_transforms.py @@ -0,0 +1,195 @@ +""" +=============================================================== +Transforms on Rotated Bounding Boxes +=============================================================== + +This example illustrates how to define and use rotated bounding boxes. + +.. note:: + Support for rotated bounding boxes was released in TorchVision 0.23 and is + currently a BETA feature. We don't expect the API to change, but there may + be some rare edge-cases. If you find any issues, please report them on + our bug tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue + +First, a bit of setup code: +""" + +# %% +from PIL import Image +from pathlib import Path +import matplotlib.pyplot as plt + + +import torch +from torchvision.tv_tensors import BoundingBoxes +from torchvision.transforms import v2 +from helpers import plot + +plt.rcParams["figure.figsize"] = [10, 5] +plt.rcParams["savefig.bbox"] = "tight" + +# if you change the seed, make sure that the randomly-applied transforms +# properly show that the image can be both transformed and *not* transformed! +torch.manual_seed(0) + +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +orig_img = Image.open(Path('../assets') / 'leaning_tower.jpg') + +# %% +# Creating a Rotated Bounding Box +# ------------------------------- +# Rotated bounding boxes are created by instantiating the +# :class:`~torchvision.tv_tensors.BoundingBoxes` class. It's the ``format`` +# parameter of the constructor that determines if a bounding box is rotated or +# not. In this instance, we use the CXCYWHR +# :attr:`~torchvision.tv_tensors.BoundingBoxFormat`. The first two values are +# the X and Y coordinates of the center of the bounding box. The next two +# values are the width and height of the bounding box, and the last value is the +# rotation of the bounding box, in degrees. 
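# %%
# As a quick, self-contained sketch with made-up values (the box used
# throughout this example is created just below), a rotated box can also be
# converted to the corner-based ``XYXYXYXY`` format, which lists the four
# ``(x, y)`` corners of the box explicitly:

toy_rotated_box = BoundingBoxes(
    [[100.0, 100.0, 60.0, 20.0, 30.0]],  # cx, cy, w, h, angle in degrees
    format="CXCYWHR",
    canvas_size=(200, 200),
)
print(v2.ConvertBoundingBoxFormat("XYXYXYXY")(toy_rotated_box))

# %%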
+ + +orig_box = BoundingBoxes( + [ + [860.0, 1100, 570, 1840, -7], + ], + format="CXCYWHR", + canvas_size=(orig_img.size[1], orig_img.size[0]), +) + +plot([(orig_img, orig_box)], bbox_width=10) + +# %% +# Transforms illustrations +# ------------------------ +# +# Using :class:`~torchvision.transforms.RandomRotation`: +rotater = v2.RandomRotation(degrees=(0, 180), expand=True) +rotated_imgs = [rotater((orig_img, orig_box)) for _ in range(4)] +plot([(orig_img, orig_box)] + rotated_imgs, bbox_width=10) + +# %% +# Using :class:`~torchvision.transforms.Pad`: +padded_imgs_and_boxes = [ + v2.Pad(padding=padding)(orig_img, orig_box) + for padding in (30, 50, 100, 200) +] +plot([(orig_img, orig_box)] + padded_imgs_and_boxes, bbox_width=10) + +# %% +# Using :class:`~torchvision.transforms.Resize`: +resized_imgs = [ + v2.Resize(size=size)(orig_img, orig_box) + for size in (30, 50, 100, orig_img.size) +] +plot([(orig_img, orig_box)] + resized_imgs, bbox_width=5) + +# %% +# Note that the bounding box looking bigger in the images with less pixels is +# an artifact, not reality. That is merely the rasterised representation of the +# bounding box's boundaries appearing bigger because we specify a fixed width of +# that rasterized line. When the image is, say, only 30 pixels wide, a +# line that is 3 pixels wide is relatively large. +# +# .. _clamping_mode_tuto: +# +# Clamping Mode, and its effect on transforms +# ------------------------------------------- +# +# Some transforms, such as :class:`~torchvision.transforms.CenterCrop`, may +# result in having the transformed bounding box partially outside of the +# transformed (cropped) image. In general, this may happen on most of the +# :ref:`geometric transforms `. +# +# In such cases, the bounding box is clamped to the transformed image size based +# on its ``clamping_mode`` attribute. There are three values for +# ``clamping_mode``, which determines how the box is clamped after a +# transformation: +# +# - ``None``: No clamping is applied, and the bounding box may be partially +# outside of the image. +# - `"hard"`: The box is clamped to the image size, such that all its corners +# are within the image canvas. This potentially results in a loss of +# information, and it can lead to unintuitive resuts. But may be necessary +# for some applications e.g. if the model doesn't support boxes outside of +# their image. +# - `"soft"`: . This is an intermediate mode between ``None`` and "hard": the +# box is clamped, but not as strictly as in "hard" mode. Some box dimensions +# may still be outside of the image. This is the default when constucting +# :class:`~torchvision.tv_tensors.BoundingBoxes`. +# +# .. note:: +# +# For axis-aligned bounding boxes, the `"soft"` and `"hard"` modes behave +# the same, as the bounding box is always clamped to the image size. 
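# %%
# As a minimal, hedged sketch of clamping a box manually (using a made-up,
# axis-aligned box for simplicity, where "soft" and "hard" coincide as noted
# above; the next cells illustrate the clamping modes on the real image):

toy_box = BoundingBoxes(
    [[-20.0, 10.0, 50.0, 90.0]],  # sticks out of the canvas on the left
    format="XYXY",
    canvas_size=(100, 100),
    clamping_mode=None,  # keep it un-clamped at construction time
)
print(toy_box)
print(v2.ClampBoundingBoxes(clamping_mode="hard")(toy_box))  # x1 expected to become 0

# %%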
+# +# Let's illustrate the clamping modes with +# :class:`~torchvision.transforms.CenterCrop` transform: + +assert orig_box.clamping_mode == "soft" + +box_hard_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode="hard") + +box_no_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode=None) + +crop_sizes = (800, 1200, 2000, orig_img.size) +soft_center_crops_and_boxes = [ + v2.CenterCrop(size=size)(orig_img, orig_box) + for size in crop_sizes +] + +hard_center_crops_and_boxes = [ + v2.CenterCrop(size=size)(orig_img, box_hard_clamping) + for size in crop_sizes +] + +no_clamping_center_crops_and_boxes = [ + v2.CenterCrop(size=size)(orig_img, box_no_clamping) + for size in crop_sizes +] + +plot([[(orig_img, box_hard_clamping)] + hard_center_crops_and_boxes, + [(orig_img, orig_box)] + soft_center_crops_and_boxes, + [(orig_img, box_no_clamping)] + no_clamping_center_crops_and_boxes], + bbox_width=10) + +# %% +# The plot above shows the "hard" clamping mode, "soft" and ``None``, in this +# order. While "soft" and ``None`` result in similar plots, they do not lead to +# the exact same clamped boxes. The non-clamped boxes will show dimensions that are further away from the image: +print("boxes with soft clamping:") +print(soft_center_crops_and_boxes) +print() +print("boxes with no clamping:") +print(no_clamping_center_crops_and_boxes) + +# %% +# +# Setting the clamping mode +# -------------------------- +# +# The ``clamping_mode`` attribute, which determines the clamping strategy that +# is applied to a box, can be set in different ways: +# +# - When constructing the bounding box with its +# :class:`~torchvision.tv_tensors.BoundingBoxes` constructor, as done in the example above. +# - By directly setting the attribute on an existing instance, e.g. ``boxes.clamping_mode = "hard"``. +# - By calling the :class:`~torchvision.transforms.v2.SetClampingMode` transform. +# +# Also, remember that you can always clamp the bounding box manually by +# calling the :meth:`~torchvision.transforms.v2.ClampBoundingBoxes` transform! +# Here's an example illustrating all of these option: + +t = v2.Compose([ + v2.CenterCrop(size=(800,)), # clamps according to the current clamping_mode + # attribute, in this case set by the constructor + v2.SetClampingMode(None), # sets the clamping_mode attribute for future transforms + v2.Pad(padding=3), # clamps according to the current clamping_mode + # i.e. ``None`` + v2.ClampBoundingBoxes(clamping_mode="soft"), # clamps with "soft" mode. +]) + +out_img, out_box = t(orig_img, orig_box) +plot([(orig_img, orig_box), (out_img, out_box)], bbox_width=10) + +# %% diff --git a/gallery/transforms/plot_transforms_e2e.py b/gallery/transforms/plot_transforms_e2e.py new file mode 100644 index 00000000000..765d7ad51e5 --- /dev/null +++ b/gallery/transforms/plot_transforms_e2e.py @@ -0,0 +1,181 @@ +""" +=============================================================== +Transforms v2: End-to-end object detection/segmentation example +=============================================================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +Object detection and segmentation tasks are natively supported: +``torchvision.transforms.v2`` enables jointly transforming images, videos, +bounding boxes, and masks. 
+ +This example showcases an end-to-end instance segmentation training case using +Torchvision utils from ``torchvision.datasets``, ``torchvision.models`` and +``torchvision.transforms.v2``. Everything covered here can be applied similarly +to object detection or semantic segmentation tasks. +""" + +# %% +import pathlib + +import torch +import torch.utils.data + +from torchvision import models, datasets, tv_tensors +from torchvision.transforms import v2 + +torch.manual_seed(0) + +# This loads fake data for illustration purposes of this example. In practice, you'll have +# to replace this with the proper data. +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +ROOT = pathlib.Path("../assets") / "coco" +IMAGES_PATH = str(ROOT / "images") +ANNOTATIONS_PATH = str(ROOT / "instances.json") +from helpers import plot + + +# %% +# Dataset preparation +# ------------------- +# +# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently +# returns. + +dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH) + +sample = dataset[0] +img, target = sample +print(f"{type(img) = }\n{type(target) = }\n{type(target[0]) = }\n{target[0].keys() = }") + + +# %% +# Torchvision datasets preserve the data structure and types as it was intended +# by the datasets authors. So by default, the output structure may not always be +# compatible with the models or the transforms. +# +# To overcome that, we can use the +# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For +# :class:`~torchvision.datasets.CocoDetection`, this changes the target +# structure to a single dictionary of lists: + +dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks")) + +sample = dataset[0] +img, target = sample +print(f"{type(img) = }\n{type(target) = }\n{target.keys() = }") +print(f"{type(target['boxes']) = }\n{type(target['labels']) = }\n{type(target['masks']) = }") + +# %% +# We used the ``target_keys`` parameter to specify the kind of output we're +# interested in. Our dataset now returns a target which is dict where the values +# are :ref:`TVTensors ` (all are :class:`torch.Tensor` +# subclasses). We're dropped all unncessary keys from the previous output, but +# if you need any of the original keys e.g. "image_id", you can still ask for +# it. +# +# .. note:: +# +# If you just want to do detection, you don't need and shouldn't pass +# "masks" in ``target_keys``: if masks are present in the sample, they will +# be transformed, slowing down your transformations unnecessarily. +# +# As baseline, let's have a look at a sample without transformations: + +plot([dataset[0], dataset[1]]) + + +# %% +# Transforms +# ---------- +# +# Let's now define our pre-processing transforms. All the transforms know how +# to handle images, bounding boxes and masks when relevant. +# +# Transforms are typically passed as the ``transforms`` parameter of the +# dataset so that they can leverage multi-processing from the +# :class:`torch.utils.data.DataLoader`. 
+ +transforms = v2.Compose( + [ + v2.ToImage(), + v2.RandomPhotometricDistort(p=1), + v2.RandomZoomOut(fill={tv_tensors.Image: (123, 117, 104), "others": 0}), + v2.RandomIoUCrop(), + v2.RandomHorizontalFlip(p=1), + v2.SanitizeBoundingBoxes(), + v2.ToDtype(torch.float32, scale=True), + ] +) + +dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH, transforms=transforms) +dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=["boxes", "labels", "masks"]) + +# %% +# A few things are worth noting here: +# +# - We're converting the PIL image into a +# :class:`~torchvision.transforms.v2.Image` object. This isn't strictly +# necessary, but relying on Tensors (here: a Tensor subclass) will +# :ref:`generally be faster `. +# - We are calling :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` to +# make sure we remove degenerate bounding boxes, as well as their +# corresponding labels and masks. +# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` should be placed +# at least once at the end of a detection pipeline; it is particularly +# critical if :class:`~torchvision.transforms.v2.RandomIoUCrop` was used. +# +# Let's look how the sample looks like with our augmentation pipeline in place: + +# sphinx_gallery_thumbnail_number = 2 +plot([dataset[0], dataset[1]]) + + +# %% +# We can see that the color of the images were distorted, zoomed in or out, and flipped. +# The bounding boxes and the masks were transformed accordingly. And without any further ado, we can start training. +# +# Data loading and training loop +# ------------------------------ +# +# Below we're using Mask-RCNN which is an instance segmentation model, but +# everything we've covered in this tutorial also applies to object detection and +# semantic segmentation tasks. + +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=2, + # We need a custom collation function here, since the object detection + # models expect a sequence of images and target dictionaries. The default + # collation function tries to torch.stack() the individual elements, + # which fails in general for object detection, because the number of bounding + # boxes varies between the images of the same batch. + collate_fn=lambda batch: tuple(zip(*batch)), +) + +model = models.get_model("maskrcnn_resnet50_fpn_v2", weights=None, weights_backbone=None).train() + +for imgs, targets in data_loader: + loss_dict = model(imgs, targets) + # Put your training logic here + + print(f"{[img.shape for img in imgs] = }") + print(f"{[type(target) for target in targets] = }") + for name, loss_val in loss_dict.items(): + print(f"{name:<20}{loss_val:.3f}") + +# %% +# Training References +# ------------------- +# +# From there, you can check out the `torchvision references +# `_ where you'll find +# the actual training scripts we use to train our models. +# +# **Disclaimer** The code in our references is more complex than what you'll +# need for your own use-cases: this is because we're supporting different +# backends (PIL, tensors, TVTensors) and different transforms namespaces (v1 and +# v2). So don't be afraid to simplify and only keep what you need. diff --git a/gallery/transforms/plot_transforms_getting_started.py b/gallery/transforms/plot_transforms_getting_started.py new file mode 100644 index 00000000000..d7fb36a4a77 --- /dev/null +++ b/gallery/transforms/plot_transforms_getting_started.py @@ -0,0 +1,268 @@ +""" +================================== +Getting started with transforms v2 +================================== + +.. 
note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates all of what you need to know to get started with the +new :mod:`torchvision.transforms.v2` API. We'll cover simple tasks like +image classification, and more advanced ones like object detection / +segmentation. +""" + +# %% +# First, a bit of setup +from pathlib import Path +import torch +import matplotlib.pyplot as plt +plt.rcParams["savefig.bbox"] = 'tight' + +from torchvision.transforms import v2 +from torchvision.io import decode_image + +torch.manual_seed(1) + +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +from helpers import plot +img = decode_image(str(Path('../assets') / 'astronaut.jpg')) +print(f"{type(img) = }, {img.dtype = }, {img.shape = }") + +# %% +# The basics +# ---------- +# +# The Torchvision transforms behave like a regular :class:`torch.nn.Module` (in +# fact, most of them are): instantiate a transform, pass an input, get a +# transformed output: + +transform = v2.RandomCrop(size=(224, 224)) +out = transform(img) + +plot([img, out]) + +# %% +# I just want to do image classification +# -------------------------------------- +# +# If you just care about image classification, things are very simple. A basic +# classification pipeline may look like this: + +transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +]) +out = transforms(img) + +plot([img, out]) + +# %% +# Such transformation pipeline is typically passed as the ``transform`` argument +# to the :ref:`Datasets `, e.g. ``ImageNet(..., +# transform=transforms)``. +# +# That's pretty much all there is. From there, read through our :ref:`main docs +# ` to learn more about recommended practices and conventions, or +# explore more :ref:`examples ` e.g. how to use augmentation +# transforms like :ref:`CutMix and MixUp +# `. +# +# .. note:: +# +# If you're already relying on the ``torchvision.transforms`` v1 API, +# we recommend to :ref:`switch to the new v2 transforms`. It's +# very easy: the v2 transforms are fully compatible with the v1 API, so you +# only need to change the import! +# +# Videos, boxes, masks, keypoints +# ------------------------------- +# +# The Torchvision transforms in the ``torchvision.transforms.v2`` namespace +# support tasks beyond image classification: they can also transform rotated or +# axis-aligned bounding boxes, segmentation / detection masks, videos, and +# keypoints. +# +# Let's briefly look at a detection example with bounding boxes. + +from torchvision import tv_tensors # we'll describe this a bit later, bare with us + +boxes = tv_tensors.BoundingBoxes( + [ + [15, 10, 370, 510], + [275, 340, 510, 510], + [130, 345, 210, 425] + ], + format="XYXY", canvas_size=img.shape[-2:]) + +transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomPhotometricDistort(p=1), + v2.RandomHorizontalFlip(p=1), +]) +out_img, out_boxes = transforms(img, boxes) +print(type(boxes), type(out_boxes)) + +plot([(img, boxes), (out_img, out_boxes)]) + +# %% +# +# The example above focuses on object detection. 
But if we had masks +# (:class:`torchvision.tv_tensors.Mask`) for object segmentation or semantic +# segmentation, or videos (:class:`torchvision.tv_tensors.Video`), we could have +# passed them to the transforms in exactly the same way. +# +# By now you likely have a few questions: what are these TVTensors, how do we +# use them, and what is the expected input/output of those transforms? We'll +# answer these in the next sections. + +# %% +# +# .. _what_are_tv_tensors: +# +# What are TVTensors? +# -------------------- +# +# TVTensors are :class:`torch.Tensor` subclasses. The available TVTensors are +# :class:`~torchvision.tv_tensors.Image`, +# :class:`~torchvision.tv_tensors.BoundingBoxes`, +# :class:`~torchvision.tv_tensors.Mask`, +# :class:`~torchvision.tv_tensors.Video`, and +# :class:`~torchvision.tv_tensors.KeyPoints`. +# +# TVTensors look and feel just like regular tensors - they **are** tensors. +# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` +# or any ``torch.*`` operator will also work on a TVTensor: + +img_dp = tv_tensors.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)) + +print(f"{isinstance(img_dp, torch.Tensor) = }") +print(f"{img_dp.dtype = }, {img_dp.shape = }, {img_dp.sum() = }") + +# %% +# These TVTensor classes are at the core of the transforms: in order to +# transform a given input, the transforms first look at the **class** of the +# object, and dispatch to the appropriate implementation accordingly. +# +# You don't need to know much more about TVTensors at this point, but advanced +# users who want to learn more can refer to +# :ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`. +# +# What do I pass as input? +# ------------------------ +# +# Above, we've seen two examples: one where we passed a single image as input +# i.e. ``out = transforms(img)``, and one where we passed both an image and +# bounding boxes, i.e. ``out_img, out_boxes = transforms(img, boxes)``. +# +# In fact, transforms support **arbitrary input structures**. The input can be a +# single image, a tuple, an arbitrarily nested dictionary... pretty much +# anything. The same structure will be returned as output. Below, we use the +# same detection transforms, but pass a tuple (image, target_dict) as input and +# we're getting the same structure as output: + +target = { + "boxes": boxes, + "labels": torch.arange(boxes.shape[0]), + "this_is_ignored": ("arbitrary", {"structure": "!"}) +} + +# Re-using the transforms and definitions from above. +out_img, out_target = transforms(img, target) + +# sphinx_gallery_thumbnail_number = 4 +plot([(img, target["boxes"]), (out_img, out_target["boxes"])]) +print(f"{out_target['this_is_ignored']}") + +# %% +# We passed a tuple so we get a tuple back, and the second element is the +# tranformed target dict. Transforms don't really care about the structure of +# the input; as mentioned above, they only care about the **type** of the +# objects and transforms them accordingly. +# +# *Foreign* objects like strings or ints are simply passed-through. This can be +# useful e.g. if you want to associate a path with every single sample when +# debugging! +# +# .. _passthrough_heuristic: +# +# .. note:: +# +# **Disclaimer** This note is slightly advanced and can be safely skipped on +# a first read. +# +# Pure :class:`torch.Tensor` objects are, in general, treated as images (or +# as videos for video-specific transforms). 
Indeed, you may have noticed +# that in the code above we haven't used the +# :class:`~torchvision.tv_tensors.Image` class at all, and yet our images +# got transformed properly. Transforms follow the following logic to +# determine whether a pure Tensor should be treated as an image (or video), +# or just ignored: +# +# * If there is an :class:`~torchvision.tv_tensors.Image`, +# :class:`~torchvision.tv_tensors.Video`, +# or :class:`PIL.Image.Image` instance in the input, all other pure +# tensors are passed-through. +# * If there is no :class:`~torchvision.tv_tensors.Image` or +# :class:`~torchvision.tv_tensors.Video` instance, only the first pure +# :class:`torch.Tensor` will be transformed as image or video, while all +# others will be passed-through. Here "first" means "first in a depth-wise +# traversal". +# +# This is what happened in the detection example above: the first pure +# tensor was the image so it got transformed properly, and all other pure +# tensor instances like the ``labels`` were passed-through (although labels +# can still be transformed by some transforms like +# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`!). +# +# .. _transforms_datasets_intercompatibility: +# +# Transforms and Datasets intercompatibility +# ------------------------------------------ +# +# Roughly speaking, the output of the datasets must correspond to the input of +# the transforms. How to do that depends on whether you're using the torchvision +# :ref:`built-in datatsets `, or your own custom datasets. +# +# Using built-in datasets +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# If you're just doing image classification, you don't need to do anything. Just +# use ``transform`` argument of the dataset e.g. ``ImageNet(..., +# transform=transforms)`` and you're good to go. +# +# Torchvision also supports datasets for object detection or segmentation like +# :class:`torchvision.datasets.CocoDetection`. Those datasets predate +# the existence of the :mod:`torchvision.transforms.v2` module and of the +# TVTensors, so they don't return TVTensors out of the box. +# +# An easy way to force those datasets to return TVTensors and to make them +# compatible with v2 transforms is to use the +# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2` function: +# +# .. code-block:: python +# +# from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2 +# +# dataset = CocoDetection(..., transforms=my_transforms) +# dataset = wrap_dataset_for_transforms_v2(dataset) +# # Now the dataset returns TVTensors! +# +# Using your own datasets +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# If you have a custom dataset, then you'll need to convert your objects into +# the appropriate TVTensor classes. Creating TVTensor instances is very easy, +# refer to :ref:`tv_tensor_creation` for more details. +# +# There are two main places where you can implement that conversion logic: +# +# - At the end of the datasets's ``__getitem__`` method, before returning the +# sample (or by sub-classing the dataset). +# - As the very first step of your transforms pipeline +# +# Either way, the logic will depend on your specific dataset. 
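# %%
# As a hedged sketch (the dataset class and the ``_load_raw_sample`` helper
# are hypothetical), the first option could look like this: the raw
# annotations are wrapped into TVTensors at the end of ``__getitem__``, right
# before the v2 transforms are applied:
#
# .. code-block:: python
#
#     from torchvision import tv_tensors
#
#     class MyDetectionDataset(torch.utils.data.Dataset):
#         def __init__(self, transforms=None):
#             self.transforms = transforms
#
#         def __getitem__(self, idx):
#             # img is assumed to be a PIL image here
#             img, raw_boxes, raw_labels = self._load_raw_sample(idx)
#             target = {
#                 "boxes": tv_tensors.BoundingBoxes(
#                     raw_boxes, format="XYXY",
#                     canvas_size=(img.height, img.width),
#                 ),
#                 "labels": torch.as_tensor(raw_labels),
#             }
#             if self.transforms is not None:
#                 img, target = self.transforms(img, target)
#             return img, target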
diff --git a/gallery/plot_transforms.py b/gallery/transforms/plot_transforms_illustrations.py similarity index 54% rename from gallery/plot_transforms.py rename to gallery/transforms/plot_transforms_illustrations.py index c6e44a14e22..0c1f3b40021 100644 --- a/gallery/plot_transforms.py +++ b/gallery/transforms/plot_transforms_illustrations.py @@ -3,317 +3,329 @@ Illustration of transforms ========================== -This example illustrates the various transforms available in :ref:`the -torchvision.transforms module `. +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates some of the various transforms available in :ref:`the +torchvision.transforms.v2 module `. """ +# %% # sphinx_gallery_thumbnail_path = "../../gallery/assets/transforms_thumbnail.png" from PIL import Image from pathlib import Path import matplotlib.pyplot as plt -import numpy as np import torch -import torchvision.transforms as T - +from torchvision.transforms import v2 plt.rcParams["savefig.bbox"] = 'tight' -orig_img = Image.open(Path('assets') / 'astronaut.jpg') + # if you change the seed, make sure that the randomly-applied transforms # properly show that the image can be both transformed and *not* transformed! torch.manual_seed(0) - -def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs): - if not isinstance(imgs[0], list): - # Make a 2d grid even if there's just 1 row - imgs = [imgs] - - num_rows = len(imgs) - num_cols = len(imgs[0]) + with_orig - fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False) - for row_idx, row in enumerate(imgs): - row = [orig_img] + row if with_orig else row - for col_idx, img in enumerate(row): - ax = axs[row_idx, col_idx] - ax.imshow(np.asarray(img), **imshow_kwargs) - ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - - if with_orig: - axs[0, 0].set(title='Original image') - axs[0, 0].title.set_size(8) - if row_title is not None: - for row_idx in range(num_rows): - axs[row_idx, 0].set(ylabel=row_title[row_idx]) - - plt.tight_layout() - - -#################################### +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +from helpers import plot +orig_img = Image.open(Path('../assets') / 'astronaut.jpg') + +# %% +# Geometric Transforms +# -------------------- +# Geometric image transformation refers to the process of altering the geometric properties of an image, +# such as its shape, size, orientation, or position. +# It involves applying mathematical operations to the image pixels or coordinates to achieve the desired transformation. +# # Pad -# --- +# ~~~ # The :class:`~torchvision.transforms.Pad` transform # (see also :func:`~torchvision.transforms.functional.pad`) -# fills image borders with some pixel values. -padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)] -plot(padded_imgs) +# pads all image borders with some pixel values. +padded_imgs = [v2.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)] +plot([orig_img] + padded_imgs) -#################################### +# %% # Resize -# ------ +# ~~~~~~ # The :class:`~torchvision.transforms.Resize` transform # (see also :func:`~torchvision.transforms.functional.resize`) # resizes an image. 
-resized_imgs = [T.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] -plot(resized_imgs) +resized_imgs = [v2.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] +plot([orig_img] + resized_imgs) -#################################### +# %% # CenterCrop -# ---------- +# ~~~~~~~~~~ # The :class:`~torchvision.transforms.CenterCrop` transform # (see also :func:`~torchvision.transforms.functional.center_crop`) # crops the given image at the center. -center_crops = [T.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] -plot(center_crops) +center_crops = [v2.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] +plot([orig_img] + center_crops) -#################################### +# %% # FiveCrop -# -------- +# ~~~~~~~~ # The :class:`~torchvision.transforms.FiveCrop` transform # (see also :func:`~torchvision.transforms.functional.five_crop`) # crops the given image into four corners and the central crop. -(top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(100, 100))(orig_img) -plot([top_left, top_right, bottom_left, bottom_right, center]) - -#################################### -# Grayscale -# --------- -# The :class:`~torchvision.transforms.Grayscale` transform -# (see also :func:`~torchvision.transforms.functional.to_grayscale`) -# converts an image to grayscale -gray_img = T.Grayscale()(orig_img) -plot([gray_img], cmap='gray') - -#################################### -# Random transforms -# ----------------- -# The following transforms are random, which means that the same transfomer -# instance will produce different result each time it transforms a given image. -# -# ColorJitter -# ~~~~~~~~~~~ -# The :class:`~torchvision.transforms.ColorJitter` transform -# randomly changes the brightness, saturation, and other properties of an image. -jitter = T.ColorJitter(brightness=.5, hue=.3) -jitted_imgs = [jitter(orig_img) for _ in range(4)] -plot(jitted_imgs) - -#################################### -# GaussianBlur -# ~~~~~~~~~~~~ -# The :class:`~torchvision.transforms.GaussianBlur` transform -# (see also :func:`~torchvision.transforms.functional.gaussian_blur`) -# performs gaussian blur transform on an image. -blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)) -blurred_imgs = [blurrer(orig_img) for _ in range(4)] -plot(blurred_imgs) +(top_left, top_right, bottom_left, bottom_right, center) = v2.FiveCrop(size=(100, 100))(orig_img) +plot([orig_img] + [top_left, top_right, bottom_left, bottom_right, center]) -#################################### +# %% # RandomPerspective # ~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomPerspective` transform # (see also :func:`~torchvision.transforms.functional.perspective`) # performs random perspective transform on an image. -perspective_transformer = T.RandomPerspective(distortion_scale=0.6, p=1.0) +perspective_transformer = v2.RandomPerspective(distortion_scale=0.6, p=1.0) perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)] -plot(perspective_imgs) +plot([orig_img] + perspective_imgs) -#################################### +# %% # RandomRotation # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomRotation` transform # (see also :func:`~torchvision.transforms.functional.rotate`) # rotates an image with random angle. 
-rotater = T.RandomRotation(degrees=(0, 180)) +rotater = v2.RandomRotation(degrees=(0, 180)) rotated_imgs = [rotater(orig_img) for _ in range(4)] -plot(rotated_imgs) +plot([orig_img] + rotated_imgs) -#################################### +# %% # RandomAffine # ~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAffine` transform # (see also :func:`~torchvision.transforms.functional.affine`) # performs random affine transform on an image. -affine_transfomer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)) +affine_transfomer = v2.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)) affine_imgs = [affine_transfomer(orig_img) for _ in range(4)] -plot(affine_imgs) +plot([orig_img] + affine_imgs) -#################################### +# %% # ElasticTransform # ~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.ElasticTransform` transform # (see also :func:`~torchvision.transforms.functional.elastic_transform`) # Randomly transforms the morphology of objects in images and produces a # see-through-water-like effect. -elastic_transformer = T.ElasticTransform(alpha=250.0) +elastic_transformer = v2.ElasticTransform(alpha=250.0) transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomCrop # ~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomCrop` transform # (see also :func:`~torchvision.transforms.functional.crop`) # crops an image at a random location. -cropper = T.RandomCrop(size=(128, 128)) +cropper = v2.RandomCrop(size=(128, 128)) crops = [cropper(orig_img) for _ in range(4)] -plot(crops) +plot([orig_img] + crops) -#################################### +# %% # RandomResizedCrop # ~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomResizedCrop` transform # (see also :func:`~torchvision.transforms.functional.resized_crop`) # crops an image at a random location, and then resizes the crop to a given # size. -resize_cropper = T.RandomResizedCrop(size=(32, 32)) +resize_cropper = v2.RandomResizedCrop(size=(32, 32)) resized_crops = [resize_cropper(orig_img) for _ in range(4)] -plot(resized_crops) +plot([orig_img] + resized_crops) + +# %% +# Photometric Transforms +# ---------------------- +# Photometric image transformation refers to the process of modifying the photometric properties of an image, +# such as its brightness, contrast, color, or tone. +# These transformations are applied to change the visual appearance of an image +# while preserving its geometric structure. +# +# Except :class:`~torchvision.transforms.Grayscale`, the following transforms are random, +# which means that the same transform +# instance will produce different result each time it transforms a given image. +# +# Grayscale +# ~~~~~~~~~ +# The :class:`~torchvision.transforms.Grayscale` transform +# (see also :func:`~torchvision.transforms.functional.to_grayscale`) +# converts an image to grayscale +gray_img = v2.Grayscale()(orig_img) +plot([orig_img, gray_img], cmap='gray') -#################################### +# %% +# ColorJitter +# ~~~~~~~~~~~ +# The :class:`~torchvision.transforms.ColorJitter` transform +# randomly changes the brightness, contrast, saturation, hue, and other properties of an image. 
+jitter = v2.ColorJitter(brightness=.5, hue=.3) +jittered_imgs = [jitter(orig_img) for _ in range(4)] +plot([orig_img] + jittered_imgs) + +# %% +# GaussianBlur +# ~~~~~~~~~~~~ +# The :class:`~torchvision.transforms.GaussianBlur` transform +# (see also :func:`~torchvision.transforms.functional.gaussian_blur`) +# performs gaussian blur transform on an image. +blurrer = v2.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5.)) +blurred_imgs = [blurrer(orig_img) for _ in range(4)] +plot([orig_img] + blurred_imgs) + +# %% # RandomInvert # ~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomInvert` transform # (see also :func:`~torchvision.transforms.functional.invert`) # randomly inverts the colors of the given image. -inverter = T.RandomInvert() +inverter = v2.RandomInvert() invertered_imgs = [inverter(orig_img) for _ in range(4)] -plot(invertered_imgs) +plot([orig_img] + invertered_imgs) -#################################### +# %% # RandomPosterize # ~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomPosterize` transform # (see also :func:`~torchvision.transforms.functional.posterize`) # randomly posterizes the image by reducing the number of bits # of each color channel. -posterizer = T.RandomPosterize(bits=2) +posterizer = v2.RandomPosterize(bits=2) posterized_imgs = [posterizer(orig_img) for _ in range(4)] -plot(posterized_imgs) +plot([orig_img] + posterized_imgs) -#################################### +# %% # RandomSolarize # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomSolarize` transform # (see also :func:`~torchvision.transforms.functional.solarize`) # randomly solarizes the image by inverting all pixel values above # the threshold. -solarizer = T.RandomSolarize(threshold=192.0) +solarizer = v2.RandomSolarize(threshold=192.0) solarized_imgs = [solarizer(orig_img) for _ in range(4)] -plot(solarized_imgs) +plot([orig_img] + solarized_imgs) -#################################### +# %% # RandomAdjustSharpness # ~~~~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAdjustSharpness` transform # (see also :func:`~torchvision.transforms.functional.adjust_sharpness`) # randomly adjusts the sharpness of the given image. -sharpness_adjuster = T.RandomAdjustSharpness(sharpness_factor=2) +sharpness_adjuster = v2.RandomAdjustSharpness(sharpness_factor=2) sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)] -plot(sharpened_imgs) +plot([orig_img] + sharpened_imgs) -#################################### +# %% # RandomAutocontrast # ~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAutocontrast` transform # (see also :func:`~torchvision.transforms.functional.autocontrast`) # randomly applies autocontrast to the given image. -autocontraster = T.RandomAutocontrast() +autocontraster = v2.RandomAutocontrast() autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)] -plot(autocontrasted_imgs) +plot([orig_img] + autocontrasted_imgs) -#################################### +# %% # RandomEqualize # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomEqualize` transform # (see also :func:`~torchvision.transforms.functional.equalize`) # randomly equalizes the histogram of the given image. 
-equalizer = T.RandomEqualize() +equalizer = v2.RandomEqualize() equalized_imgs = [equalizer(orig_img) for _ in range(4)] -plot(equalized_imgs) +plot([orig_img] + equalized_imgs) -#################################### +# %% +# JPEG +# ~~~~~~~~~~~~~~ +# The :class:`~torchvision.transforms.v2.JPEG` transform +# (see also :func:`~torchvision.transforms.v2.functional.jpeg`) +# applies JPEG compression to the given image with random +# degree of compression. +jpeg = v2.JPEG((5, 50)) +jpeg_imgs = [jpeg(orig_img) for _ in range(4)] +plot([orig_img] + jpeg_imgs) + +# %% +# Augmentation Transforms +# ----------------------- +# The following transforms are combinations of multiple transforms, +# either geometric or photometric, or both. +# # AutoAugment # ~~~~~~~~~~~ # The :class:`~torchvision.transforms.AutoAugment` transform # automatically augments data based on a given auto-augmentation policy. # See :class:`~torchvision.transforms.AutoAugmentPolicy` for the available policies. -policies = [T.AutoAugmentPolicy.CIFAR10, T.AutoAugmentPolicy.IMAGENET, T.AutoAugmentPolicy.SVHN] -augmenters = [T.AutoAugment(policy) for policy in policies] +policies = [v2.AutoAugmentPolicy.CIFAR10, v2.AutoAugmentPolicy.IMAGENET, v2.AutoAugmentPolicy.SVHN] +augmenters = [v2.AutoAugment(policy) for policy in policies] imgs = [ [augmenter(orig_img) for _ in range(4)] for augmenter in augmenters ] row_title = [str(policy).split('.')[-1] for policy in policies] -plot(imgs, row_title=row_title) +plot([[orig_img] + row for row in imgs], row_title=row_title) -#################################### +# %% # RandAugment # ~~~~~~~~~~~ -# The :class:`~torchvision.transforms.RandAugment` transform automatically augments the data. -augmenter = T.RandAugment() +# The :class:`~torchvision.transforms.RandAugment` is an alternate version of AutoAugment. +augmenter = v2.RandAugment() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### +# %% # TrivialAugmentWide # ~~~~~~~~~~~~~~~~~~ -# The :class:`~torchvision.transforms.TrivialAugmentWide` transform automatically augments the data. -augmenter = T.TrivialAugmentWide() +# The :class:`~torchvision.transforms.TrivialAugmentWide` is an alternate implementation of AutoAugment. +# However, instead of transforming an image multiple times, it transforms an image only once +# using a random transform from a given list with a random strength number. +augmenter = v2.TrivialAugmentWide() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### +# %% # AugMix # ~~~~~~ -# The :class:`~torchvision.transforms.AugMix` transform automatically augments the data. -augmenter = T.AugMix() +# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image. +augmenter = v2.AugMix() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### -# Randomly-applied transforms +# %% +# Randomly-applied Transforms # --------------------------- # -# Some transforms are randomly-applied given a probability ``p``. That is, the -# transformed image may actually be the same as the original one, even when -# called with the same transformer instance! +# The following transforms are randomly-applied given a probability ``p``. 
That is, given ``p = 0.5``, +# there is a 50% chance to return the original image, and a 50% chance to return the transformed image, +# even when called with the same transform instance! # # RandomHorizontalFlip # ~~~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomHorizontalFlip` transform # (see also :func:`~torchvision.transforms.functional.hflip`) # performs horizontal flip of an image, with a given probability. -hflipper = T.RandomHorizontalFlip(p=0.5) +hflipper = v2.RandomHorizontalFlip(p=0.5) transformed_imgs = [hflipper(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomVerticalFlip # ~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomVerticalFlip` transform # (see also :func:`~torchvision.transforms.functional.vflip`) # performs vertical flip of an image, with a given probability. -vflipper = T.RandomVerticalFlip(p=0.5) +vflipper = v2.RandomVerticalFlip(p=0.5) transformed_imgs = [vflipper(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomApply # ~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomApply` transform # randomly applies a list of transforms, with a given probability. -applier = T.RandomApply(transforms=[T.RandomCrop(size=(64, 64))], p=0.5) +applier = v2.RandomApply(transforms=[v2.RandomCrop(size=(64, 64))], p=0.5) transformed_imgs = [applier(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) diff --git a/gallery/transforms/plot_tv_tensors.py b/gallery/transforms/plot_tv_tensors.py new file mode 100644 index 00000000000..2c6ebbf9031 --- /dev/null +++ b/gallery/transforms/plot_tv_tensors.py @@ -0,0 +1,233 @@ +""" +============= +TVTensors FAQ +============= + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + + +TVTensors are Tensor subclasses introduced together with +``torchvision.transforms.v2``. This example showcases what these TVTensors are +and how they behave. + +.. warning:: + + **Intended Audience** Unless you're writing your own transforms or your own TVTensors, you + probably do not need to read this guide. This is a fairly low-level topic + that most users will not need to worry about: you do not need to understand + the internals of TVTensors to efficiently rely on + ``torchvision.transforms.v2``. It may however be useful for advanced users + trying to implement their own datasets, transforms, or work directly with + the TVTensors. +""" + +# %% +import PIL.Image + +import torch +from torchvision import tv_tensors + + +# %% +# What are TVTensors? +# ------------------- +# +# TVTensors are zero-copy tensor subclasses: + +tensor = torch.rand(3, 256, 256) +image = tv_tensors.Image(tensor) + +assert isinstance(image, torch.Tensor) +assert image.data_ptr() == tensor.data_ptr() + +# %% +# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function +# for the input data. +# +# :mod:`torchvision.tv_tensors` supports five types of TVTensors: +# +# * :class:`~torchvision.tv_tensors.Image` +# * :class:`~torchvision.tv_tensors.Video` +# * :class:`~torchvision.tv_tensors.BoundingBoxes` +# * :class:`~torchvision.tv_tensors.KeyPoints` +# * :class:`~torchvision.tv_tensors.Mask` +# +# What can I do with a TVTensor? +# ------------------------------ +# +# TVTensors look and feel just like regular tensors - they **are** tensors. 
+# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` or +# any ``torch.*`` operator will also work on TVTensors. See +# :ref:`tv_tensor_unwrapping_behaviour` for a few gotchas. + +# %% +# .. _tv_tensor_creation: +# +# How do I construct a TVTensor? +# ------------------------------ +# +# Using the constructor +# ^^^^^^^^^^^^^^^^^^^^^ +# +# Each TVTensor class takes any tensor-like data that can be turned into a :class:`~torch.Tensor` + +image = tv_tensors.Image([[[[0, 1], [1, 0]]]]) +print(image) + + +# %% +# Similar to other PyTorch creations ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad`` +# parameters. + +float_image = tv_tensors.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +print(float_image) + + +# %% +# In addition, :class:`~torchvision.tv_tensors.Image` and :class:`~torchvision.tv_tensors.Mask` can also take a +# :class:`PIL.Image.Image` directly: + +image = tv_tensors.Image(PIL.Image.open("../assets/astronaut.jpg")) +print(image.shape, image.dtype) + +# %% +# Some TVTensors require additional metadata to be passed in ordered to be constructed. For example, +# :class:`~torchvision.tv_tensors.BoundingBoxes` requires the coordinate format as well as the size of the +# corresponding image (``canvas_size``) alongside the actual values. These +# metadata are required to properly transform the bounding boxes. +# In a similar fashion, :class:`~torchvision.tv_tensors.KeyPoints` also require the ``canvas_size`` metadata to be added. + +bboxes = tv_tensors.BoundingBoxes( + [[17, 16, 344, 495], [0, 10, 0, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=image.shape[-2:] +) +print(bboxes) + + +keypoints = tv_tensors.KeyPoints( + [[17, 16], [344, 495], [0, 10], [0, 10]], + canvas_size=image.shape[-2:] +) +print(keypoints) + +# %% +# Using ``tv_tensors.wrap()`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# You can also use the :func:`~torchvision.tv_tensors.wrap` function to wrap a tensor object +# into a TVTensor. This is useful when you already have an object of the +# desired type, which typically happens when writing transforms: you just want +# to wrap the output like the input. + +new_bboxes = torch.tensor([0, 20, 30, 40]) +new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) +assert new_bboxes.canvas_size == bboxes.canvas_size + +# %% +# The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass +# it as a parameter to override it. +# +# .. _tv_tensor_unwrapping_behaviour: +# +# I had a TVTensor but now I have a Tensor. Help! +# ----------------------------------------------- +# +# By default, operations on :class:`~torchvision.tv_tensors.TVTensor` objects +# will return a pure Tensor: + + +assert isinstance(bboxes, tv_tensors.BoundingBoxes) + +# Shift bboxes by 3 pixels in both H and W +new_bboxes = bboxes + 3 + +assert isinstance(new_bboxes, torch.Tensor) +assert not isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# .. note:: +# +# This behavior only affects native ``torch`` operations. If you are using +# the built-in ``torchvision`` transforms or functionals, you will always get +# as output the same type that you passed as input (pure ``Tensor`` or +# ``TVTensor``). + +# %% +# But I want a TVTensor back! 
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# You can re-wrap a pure tensor into a TVTensor by just calling the TVTensor +# constructor, or by using the :func:`~torchvision.tv_tensors.wrap` function +# (see more details above in :ref:`tv_tensor_creation`): + +new_bboxes = bboxes + 3 +new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# Alternatively, you can use the :func:`~torchvision.tv_tensors.set_return_type` +# as a global config setting for the whole program, or as a context manager +# (read its docs to learn more about caveats): + +with tv_tensors.set_return_type("TVTensor"): + new_bboxes = bboxes + 3 +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# Why is this happening? +# ^^^^^^^^^^^^^^^^^^^^^^ +# +# **For performance reasons**. :class:`~torchvision.tv_tensors.TVTensor` +# classes are Tensor subclasses, so any operation involving a +# :class:`~torchvision.tv_tensors.TVTensor` object will go through the +# `__torch_function__ +# `_ +# protocol. This induces a small overhead, which we want to avoid when possible. +# This doesn't matter for built-in ``torchvision`` transforms because we can +# avoid the overhead there, but it could be a problem in your model's +# ``forward``. +# +# **The alternative isn't much better anyway.** For every operation where +# preserving the :class:`~torchvision.tv_tensors.TVTensor` type makes +# sense, there are just as many operations where returning a pure Tensor is +# preferable: for example, is ``img.sum()`` still an :class:`~torchvision.tv_tensors.Image`? +# If we were to preserve :class:`~torchvision.tv_tensors.TVTensor` types all +# the way, even model's logits or the output of the loss function would end up +# being of type :class:`~torchvision.tv_tensors.Image`, and surely that's not +# desirable. +# +# .. note:: +# +# This behaviour is something we're actively seeking feedback on. If you find this surprising or if you +# have any suggestions on how to better support your use-cases, please reach out to us via this issue: +# https://github.com/pytorch/vision/issues/7319 +# +# Exceptions +# ^^^^^^^^^^ +# +# There are a few exceptions to this "unwrapping" rule: +# :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, +# :meth:`torch.Tensor.detach`, and :meth:`~torch.Tensor.requires_grad_` retain +# the TVTensor type. +# +# Inplace operations on TVTensors like ``obj.add_()`` will preserve the type of +# ``obj``. However, the **returned** value of inplace operations will be a pure +# tensor: + +image = tv_tensors.Image([[[0, 1], [1, 0]]]) + +new_image = image.add_(1).mul_(2) + +# image got transformed in-place and is still a TVTensor Image, but new_image +# is a Tensor. They share the same underlying data and they're equal, just +# different classes. 
+assert isinstance(image, tv_tensors.Image) +print(image) + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, tv_tensors.Image) +assert (new_image == image).all() +assert new_image.data_ptr() == image.data_ptr() diff --git a/hubconf.py b/hubconf.py index 57ce7a0d12a..637827127ca 100644 --- a/hubconf.py +++ b/hubconf.py @@ -20,6 +20,7 @@ ) from torchvision.models.googlenet import googlenet from torchvision.models.inception import inception_v3 +from torchvision.models.maxvit import maxvit_t from torchvision.models.mnasnet import mnasnet0_5, mnasnet0_75, mnasnet1_0, mnasnet1_3 from torchvision.models.mobilenetv2 import mobilenet_v2 from torchvision.models.mobilenetv3 import mobilenet_v3_large, mobilenet_v3_small @@ -68,6 +69,17 @@ shufflenet_v2_x2_0, ) from torchvision.models.squeezenet import squeezenet1_0, squeezenet1_1 -from torchvision.models.swin_transformer import swin_b, swin_s, swin_t +from torchvision.models.swin_transformer import swin_b, swin_s, swin_t, swin_v2_b, swin_v2_s, swin_v2_t from torchvision.models.vgg import vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19, vgg19_bn +from torchvision.models.video import ( + mc3_18, + mvit_v1_b, + mvit_v2_s, + r2plus1d_18, + r3d_18, + s3d, + swin3d_b, + swin3d_s, + swin3d_t, +) from torchvision.models.vision_transformer import vit_b_16, vit_b_32, vit_h_14, vit_l_16, vit_l_32 diff --git a/ios/CMakeLists.txt b/ios/CMakeLists.txt index 6b9fd3925b2..4201240a427 100644 --- a/ios/CMakeLists.txt +++ b/ios/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) set(TARGET torchvision_ops) project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(LIBTORCH_HEADER_ROOT ${LIBTORCH_HEADER_ROOT}) set(LIBRARY_OUTPUT_PATH ../lib) diff --git a/ios/LibTorchvision.podspec b/ios/LibTorchvision.podspec index ba87820e142..b88fb70ac40 100644 --- a/ios/LibTorchvision.podspec +++ b/ios/LibTorchvision.podspec @@ -1,8 +1,8 @@ -pytorch_version = '1.12.0' +pytorch_version = '2.0.0' Pod::Spec.new do |s| s.name = 'LibTorchvision' - s.version = '0.13.0' + s.version = '0.15.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/vision' diff --git a/ios/README.md b/ios/README.md new file mode 100644 index 00000000000..0b50245f1ee --- /dev/null +++ b/ios/README.md @@ -0,0 +1,3 @@ +## Status + +The iOS demo of TorchVision is currently unmaintained, untested and likely out-of-date. 
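As a side note on the hubconf.py additions above: the newly listed entries (e.g. maxvit_t, swin_v2_t, swin_v2_s, swin_v2_b and the video models) become reachable through torch.hub once this file ships. The snippet below is a minimal, illustrative sketch of how such an entry point is typically consumed; it is not part of the patch, and it assumes network access to GitHub and that the entry names match what hubconf.py exposes.

import torch

# List the entry points exposed by hubconf.py on the default branch;
# the output should include the newly added names such as "swin_v2_t" and "maxvit_t".
print(torch.hub.list("pytorch/vision"))

# Instantiate one of the new builders without pretrained weights and
# switch it to inference mode.
model = torch.hub.load("pytorch/vision", "swin_v2_t", weights=None)
model.eval()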
diff --git a/ios/VisionTestApp/VisionTestApp/AppDelegate.h b/ios/VisionTestApp/VisionTestApp/AppDelegate.h index 0dde86886e6..27716f4b6ab 100644 --- a/ios/VisionTestApp/VisionTestApp/AppDelegate.h +++ b/ios/VisionTestApp/VisionTestApp/AppDelegate.h @@ -1,7 +1,7 @@ #import -@interface AppDelegate : UIResponder +@interface AppDelegate : UIResponder -@property(strong, nonatomic) UIWindow *window; +@property(strong, nonatomic) UIWindow* window; @end diff --git a/ios/VisionTestApp/VisionTestApp/ModelRunner.h b/ios/VisionTestApp/VisionTestApp/ModelRunner.h index f71c80c981c..cfef3a3f347 100644 --- a/ios/VisionTestApp/VisionTestApp/ModelRunner.h +++ b/ios/VisionTestApp/VisionTestApp/ModelRunner.h @@ -5,7 +5,7 @@ NS_ASSUME_NONNULL_BEGIN @interface ModelRunner : NSObject -+ (NSString* )run; ++ (NSString*)run; + (BOOL)setUp; @end diff --git a/ios/VisionTestApp/VisionTestApp/ViewController.h b/ios/VisionTestApp/VisionTestApp/ViewController.h index 82cb7c57f8a..d29a133d373 100644 --- a/ios/VisionTestApp/VisionTestApp/ViewController.h +++ b/ios/VisionTestApp/VisionTestApp/ViewController.h @@ -3,5 +3,4 @@ @interface ViewController : UIViewController - @end diff --git a/maintainer_guide.md b/maintainer_guide.md new file mode 100644 index 00000000000..3d66a701be1 --- /dev/null +++ b/maintainer_guide.md @@ -0,0 +1,76 @@ +# Torchvision maintainers guide + +This document aims at documenting user-facing policies / principles used when +developing and maintaining torchvision. Other maintainer info (e.g. release +process) can be found in the meta-internal wiki. + +### What is public and what is private? + +For the Python API, torchvision largely follows the [PyTorch +policy](https://github.com/pytorch/pytorch/wiki/Public-API-definition-and-documentation) +which is consistent with other major packages +([numpy](https://numpy.org/neps/nep-0023-backwards-compatibility.html), +[scikit-learn](https://scikit-learn.org/dev/glossary.html#term-API) etc.). +We recognize that this policy is somewhat imperfect for some edge cases, and that +it's difficult to come up with an accurate technical definition. In broad terms, +which are usually well understood by users, the policy is that: + +- modules that can be accessed without leading underscore are public +- objects in a public file that don't have a leading underscore are public +- class attributes are public iff they have no leading underscore +- the rest of the modules / objects / class attributes are considered private + +The public API has backward-compatible (BC) guarantees defined in our +deprecation policy (see below). The private API has no BC guarantees. + +For C++, code is private. For Meta employees: if a C++ change breaks fbcode, fix +fbcode or revert the change. We should be careful about models running in +production and relying on torchvision ops. + +The `test` folder is not importable and is **private.** Even meta-internal +projects should *not* rely on it (it has happened in the past and is now +programmatically impossible). + +The training references do not have BC guarantees. Breaking changes are +possible, but we should make sure that the tutorials are still running properly, +and that their intended narrative is preserved (by e.g. checking outputs, +etc.). + +The rest of the folders (build, android, ios, etc.) are private and have no BC +guarantees. + +### Deprecation policy. + +Because they're disruptive, **deprecations should only be used sparingly**.
+ +We largely follow the [PyTorch +policy](https://github.com/pytorch/pytorch/wiki/PyTorch's-Python-Frontend-Backward-and-Forward-Compatibility-Policy): +breaking changes require a deprecation period of at least 2 versions. + +Deprecations should clearly indicate their deadline in the docs and warning +messages. Avoid not committing to a deadline, or keeping deprecated APIs for too +long: it gives no incentive for users to update their code, sends conflicting +messages ("why was this API removed while this other one is still around?"), and +accumulates debt in the project. + +### Should this attribute be public? Should this function be private? + +When designing an API it’s not always obvious what should be exposed as public, +and what should be kept as a private implementation detail. The following +guidelines can be useful: + +* Functional consistency throughout the library is a top priority, for users and + developers’ sake. In doubt and unless it’s clearly wrong, expose what other + similar classes expose. +* Think really hard about the users and their use-cases, and try to expose what + they would need to address those use-cases. Aggressively keep everything else + private. Remember that the “private -> public” direction is way smoother than + the “public -> private” one: in doubt, keep it private. +* When thinking about use-cases, the general API motto applies: make what’s + simple and common easy, and make what’s complex possible (80% / 20% rule). + There might be a ~1% left that’s not addressed: that’s OK. Also, **make what’s + wrong very hard**, if not impossible. + +As a good practice, always create new files and even classes with a leading +underscore in their name. This way, everything is private by default and the +only public surface is explicitly present in an `__init__.py` file. 
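To make the deprecation and naming guidelines above concrete, here is a small illustrative sketch; it is not taken from the codebase, and the module and function names (`_example_utils.py`, `resize_legacy`, the 0.17 deadline) are made up for the example:

# torchvision/_example_utils.py  (hypothetical; leading underscore keeps it private by default)
import warnings

def resize_legacy(img, size):
    # Deprecated per the two-release policy; the message states the removal version explicitly.
    warnings.warn(
        "resize_legacy() is deprecated and will be removed in torchvision 0.17. "
        "Use torchvision.transforms.v2.Resize instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    ...

# torchvision/__init__.py  (hypothetical)
# The only public surface is what is explicitly re-exported here:
# from torchvision._example_utils import resize_legacy

Keeping the definition in an underscored file and the re-export in `__init__.py` makes the public surface explicit, which is exactly the "private by default" practice the guide recommends.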
diff --git a/mypy.ini b/mypy.ini index c1d174f4595..e25212a169d 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,7 +7,7 @@ allow_redefinition = True no_implicit_optional = True warn_redundant_casts = True -[mypy-torchvision.prototype.features.*] +[mypy-torchvision.prototype.datapoints.*] ; untyped definitions and calls disallow_untyped_defs = True @@ -17,95 +17,43 @@ no_implicit_optional = True ; warnings warn_unused_ignores = True -warn_return_any = True ; miscellaneous strictness flags allow_redefinition = True [mypy-torchvision.prototype.transforms.*] -; untyped definitions and calls -disallow_untyped_defs = True - -; None and Optional handling -no_implicit_optional = True - -; warnings -warn_unused_ignores = True -warn_return_any = True - -; miscellaneous strictness flags -allow_redefinition = True - -[mypy-torchvision.prototype.datasets.*] - -; untyped definitions and calls -disallow_untyped_defs = True - -; None and Optional handling -no_implicit_optional = True - -; warnings -warn_unused_ignores = True -warn_return_any = True -warn_unreachable = True - -; miscellaneous strictness flags -allow_redefinition = True - -[mypy-torchvision.io.image.*] - -ignore_errors = True - -[mypy-torchvision.io.video.*] - -ignore_errors = True - -[mypy-torchvision.models.densenet.*] - -ignore_errors=True - -[mypy-torchvision.models.detection.anchor_utils] - -ignore_errors = True - -[mypy-torchvision.models.detection.transform] - ignore_errors = True -[mypy-torchvision.models.detection.roi_heads] - -ignore_errors = True - -[mypy-torchvision.models.detection.faster_rcnn] +[mypy-torchvision.prototype.datasets.*] ignore_errors = True -[mypy-torchvision.models.detection.mask_rcnn] +[mypy-torchvision.prototype.models.*] ignore_errors = True -[mypy-torchvision.models.detection.keypoint_rcnn] +[mypy-torchvision.io.image.*] ignore_errors = True -[mypy-torchvision.models.detection.retinanet] +[mypy-torchvision.io.video.*] ignore_errors = True -[mypy-torchvision.models.detection.ssd] +[mypy-torchvision.io.video_reader] ignore_errors = True -[mypy-torchvision.models.detection.ssdlite] +[mypy-torchvision.models.*] -ignore_errors = True +ignore_errors=True -[mypy-torchvision.models.detection.fcos] +[mypy-torchvision.ops.*] ignore_errors = True -[mypy-torchvision.ops.*] +[mypy-torchvision.transforms._functional_pil] ignore_errors = True @@ -156,3 +104,7 @@ ignore_missing_imports = True [mypy-h5py.*] ignore_missing_imports = True + +[mypy-gdown.*] + +ignore_missing_imports = True diff --git a/packaging/README.md b/packaging/README.md deleted file mode 100644 index 3ceac53030e..00000000000 --- a/packaging/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Building torchvision packages for release - -TorchVision release packages are built by using `build_wheel.sh` and `build_conda.sh` for all permutations of -supported operating systems, compute platforms and python versions. - -OS/Python/Compute matrix is defined in https://github.com/pytorch/vision/blob/main/.circleci/regenerate.py diff --git a/packaging/build_cmake.sh b/packaging/build_cmake.sh deleted file mode 100755 index 35dfbc4a697..00000000000 --- a/packaging/build_cmake.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash -set -ex - -PARALLELISM=8 -if [ -n "$MAX_JOBS" ]; then - PARALLELISM=$MAX_JOBS -fi - -if [[ "$(uname)" != Darwin && "$OSTYPE" != "msys" ]]; then - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env -fi - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. 
"$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=conda -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_plain_constraint - -if [[ "$OSTYPE" == "msys" ]]; then - conda install -yq conda-build cmake future - pip install dataclasses -fi - -setup_visual_studio_constraint -setup_junit_results_folder - -if [[ "$(uname)" == Darwin ]]; then - # TODO: this can be removed as soon as mkl's CMake support works with clang - # see https://github.com/pytorch/vision/pull/4203 for details - MKL_CONSTRAINT='mkl==2021.2.0' -else - MKL_CONSTRAINT='' -fi - -if [[ $CONDA_BUILD_VARIANT == "cpu" ]]; then - PYTORCH_MUTEX_CONSTRAINT='pytorch-mutex=1.0=cpu' -else - PYTORCH_MUTEX_CONSTRAINT='' -fi - -conda install -yq \pytorch=$PYTORCH_VERSION $CONDA_CUDATOOLKIT_CONSTRAINT $PYTORCH_MUTEX_CONSTRAINT $MKL_CONSTRAINT numpy -c nvidia -c "pytorch-${UPLOAD_CHANNEL}" -TORCH_PATH=$(dirname $(python -c "import torch; print(torch.__file__)")) - -if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - conda install -yq libpng jpeg -else - yum install -y libpng-devel libjpeg-turbo-devel -fi - -if [[ "$OSTYPE" == "msys" ]]; then - source .circleci/unittest/windows/scripts/set_cuda_envs.sh -fi - -mkdir cpp_build -pushd cpp_build - -# Generate libtorchvision files -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA - -# Compile and install libtorchvision -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cmake.bat" $PARALLELISM - CONDA_PATH=$(dirname $(which python)) - cp -r "C:/Program Files (x86)/torchvision/include/torchvision" $CONDA_PATH/include -else - make -j$PARALLELISM - make install - - if [[ "$(uname)" == Darwin ]]; then - CONDA_PATH=$(dirname $(dirname $(which python))) - cp -r /usr/local/include/torchvision $CONDA_PATH/include/ - export C_INCLUDE_PATH=/usr/local/include - export CPLUS_INCLUDE_PATH=/usr/local/include - fi -fi - -popd - -# Install torchvision locally -python setup.py develop - -# Trace, compile and run project that uses Faster-RCNN -pushd test/tracing/frcnn -mkdir build - -# Trace model -python trace_model.py -cp fasterrcnn_resnet50_fpn.pt build - -cd build -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_frcnn.bat" $PARALLELISM - mv fasterrcnn_resnet50_fpn.pt Release - cd Release - export PATH=$(cygpath "C:/Program Files (x86)/torchvision/bin"):$(cygpath $TORCH_PATH)/lib:$PATH -else - make -j$PARALLELISM -fi - -# Run traced program -./test_frcnn_tracing - -# Compile and run the CPP example -popd -cd examples/cpp/hello_world -mkdir build - -# Trace model -python trace_model.py -cp resnet18.pt build - -cd build -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch - -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cpp_example.bat" $PARALLELISM - mv resnet18.pt Release - cd Release -else - make -j$PARALLELISM -fi - -# Run CPP example -./hello-world diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh deleted file mode 100755 index 7c45aa3e6d9..00000000000 --- a/packaging/build_conda.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -ex - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. 
"$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=conda -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_constraint -setup_visual_studio_constraint -setup_junit_results_folder -export CUDATOOLKIT_CHANNEL="nvidia" - -conda build -c $CUDATOOLKIT_CHANNEL -c defaults $CONDA_CHANNEL_FLAGS --no-anaconda-upload --python "$PYTHON_VERSION" packaging/torchvision diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh deleted file mode 100755 index 3299d16ec92..00000000000 --- a/packaging/build_wheel.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -set -ex - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. "$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=wheel -setup_env -setup_wheel_python -pip_install numpy pyyaml future ninja -pip_install --upgrade setuptools -setup_pip_pytorch_version -python setup.py clean - -# Copy binaries to be included in the wheel distribution -if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - python_exec="$(which python)" - bin_path=$(dirname $python_exec) - env_path=$(dirname $bin_path) - if [[ "$(uname)" == Darwin ]]; then - # Install delocate to relocate the required binaries - pip_install "delocate>=0.9" - else - cp "$bin_path/Library/bin/libpng16.dll" torchvision - cp "$bin_path/Library/bin/libjpeg.dll" torchvision - fi -else - # Install auditwheel to get some inspection utilities - pip_install auditwheel - - # Point to custom libraries - export LD_LIBRARY_PATH=$(pwd)/ext_libraries/lib:$LD_LIBRARY_PATH - export TORCHVISION_INCLUDE=$(pwd)/ext_libraries/include - export TORCHVISION_LIBRARY=$(pwd)/ext_libraries/lib -fi - -download_copy_ffmpeg - -if [[ "$OSTYPE" == "msys" ]]; then - IS_WHEEL=1 "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel -else - IS_WHEEL=1 python setup.py bdist_wheel -fi - - -if [[ "$(uname)" == Darwin ]]; then - pushd dist/ - python_exec="$(which python)" - bin_path=$(dirname $python_exec) - env_path=$(dirname $bin_path) - for whl in *.whl; do - DYLD_FALLBACK_LIBRARY_PATH="$env_path/lib/:$DYLD_FALLBACK_LIBRARY_PATH" delocate-wheel -v --ignore-missing-dependencies $whl - done -else - if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" python $script_dir/wheel/relocate.py - else - LD_LIBRARY_PATH="/usr/local/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH" python $script_dir/wheel/relocate.py - fi -fi diff --git a/packaging/cut_release.sh b/packaging/cut_release.sh new file mode 100755 index 00000000000..91e0e5ff15d --- /dev/null +++ b/packaging/cut_release.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# Usage (run from root of project): +# TEST_INFRA_BRANCH=release/2.1 RELEASE_BRANCH=release/2.1 RELEASE_VERSION=2.1.0 packaging/cut_release.sh +# +# TEST_INFRA_BRANCH: The release branch of test-infra that houses all reusable +# workflows +# +# RELEASE_BRANCH: The name of the release branch for this repo +# +# RELEASE_VERSION: Version of this current release + +set -eou pipefail + +# Create and Check out to Release Branch +git checkout -b "${RELEASE_BRANCH}" + +# Change all GitHub Actions to reference the test-infra release branch +# as opposed to main. 
+for i in .github/workflows/*.yml; do + if [[ "$OSTYPE" == "darwin"* ]]; then + sed -i '' -e s#@main#@"${TEST_INFRA_BRANCH}"# $i; + sed -i '' -e s#test-infra-ref:[[:space:]]main#"test-infra-ref: ${TEST_INFRA_BRANCH}"# $i; + else + sed -i -e s#@main#@"${TEST_INFRA_BRANCH}"# $i; + sed -i -e s#test-infra-ref:[[:space:]]main#"test-infra-ref: ${TEST_INFRA_BRANCH}"# $i; + fi +done + +# Update the Release Version in version.txt +echo "${RELEASE_VERSION}" >version.txt + +# Optional +# git add ./github/workflows/*.yml version.txt +# git commit -m "[RELEASE-ONLY CHANGES] Branch Cut for Release {RELEASE_VERSION}" +# git push origin "${RELEASE_BRANCH}" diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash deleted file mode 100644 index ad0f4f94d2f..00000000000 --- a/packaging/pkg_helpers.bash +++ /dev/null @@ -1,359 +0,0 @@ -# A set of useful bash functions for common functionality we need to do in -# many build scripts - - -# Setup CUDA environment variables, based on CU_VERSION -# -# Inputs: -# CU_VERSION (cpu, cu92, cu100) -# NO_CUDA_PACKAGE (bool) -# BUILD_TYPE (conda, wheel) -# -# Outputs: -# VERSION_SUFFIX (e.g., "") -# PYTORCH_VERSION_SUFFIX (e.g., +cpu) -# WHEEL_DIR (e.g., cu100/) -# CUDA_HOME (e.g., /usr/local/cuda-9.2, respected by torch.utils.cpp_extension) -# FORCE_CUDA (respected by torchvision setup.py) -# NVCC_FLAGS (respected by torchvision setup.py) -# -# Precondition: CUDA versions are installed in their conventional locations in -# /usr/local/cuda-* -# -# NOTE: Why VERSION_SUFFIX versus PYTORCH_VERSION_SUFFIX? If you're building -# a package with CUDA on a platform we support CUDA on, VERSION_SUFFIX == -# PYTORCH_VERSION_SUFFIX and everyone is happy. However, if you are building a -# package with only CPU bits (e.g., torchaudio), then VERSION_SUFFIX is always -# empty, but PYTORCH_VERSION_SUFFIX is +cpu (because that's how you get a CPU -# version of a Python package. But that doesn't apply if you're on OS X, -# since the default CU_VERSION on OS X is cpu. -setup_cuda() { - - # First, compute version suffixes. 
By default, assume no version suffixes - export VERSION_SUFFIX="" - export PYTORCH_VERSION_SUFFIX="" - export WHEEL_DIR="" - # Wheel builds need suffixes (but not if they're on OS X, which never has suffix) - if [[ "$BUILD_TYPE" == "wheel" ]] && [[ "$(uname)" != Darwin ]]; then - export PYTORCH_VERSION_SUFFIX="+$CU_VERSION" - # Match the suffix scheme of pytorch, unless this package does not have - # CUDA builds (in which case, use default) - if [[ -z "$NO_CUDA_PACKAGE" ]]; then - export VERSION_SUFFIX="$PYTORCH_VERSION_SUFFIX" - export WHEEL_DIR="$CU_VERSION/" - fi - fi - - # Now work out the CUDA settings - case "$CU_VERSION" in - cu116) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.6" - else - export CUDA_HOME=/usr/local/cuda-11.6/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cu113) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.3" - else - export CUDA_HOME=/usr/local/cuda-11.3/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cu102) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2" - else - export CUDA_HOME=/usr/local/cuda-10.2/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" - ;; - cpu) - ;; - rocm*) - export FORCE_CUDA=1 - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - if [[ -n "$CUDA_HOME" ]]; then - # Adds nvcc binary to the search path so that CMake's `find_package(CUDA)` will pick the right one - export PATH="$CUDA_HOME/bin:$PATH" - export FORCE_CUDA=1 - fi -} - -# Populate build version if necessary, and add version suffix -# -# Inputs: -# BUILD_VERSION (e.g., 0.2.0 or empty) -# VERSION_SUFFIX (e.g., +cpu) -# -# Outputs: -# BUILD_VERSION (e.g., 0.2.0.dev20190807+cpu) -# -# Fill BUILD_VERSION if it doesn't exist already with a nightly string -# Usage: setup_build_version 0.2.0 -setup_build_version() { - if [[ -z "$BUILD_VERSION" ]]; then - if [[ -z "$1" ]]; then - setup_base_build_version - else - BUILD_VERSION="$1" - fi - BUILD_VERSION="$BUILD_VERSION.dev$(date "+%Y%m%d")$VERSION_SUFFIX" - else - BUILD_VERSION="$BUILD_VERSION$VERSION_SUFFIX" - fi - - # Set build version based on tag if on tag - if [[ -n "${CIRCLE_TAG}" ]]; then - # Strip tag - BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')${VERSION_SUFFIX}" - fi - - export BUILD_VERSION -} - -setup_base_build_version() { - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - # version.txt for some reason has `a` character after major.minor.rev - # command below yields 0.10.0 from version.txt containing 0.10.0a0 - BUILD_VERSION=$( cut -f 1 -d a "$SCRIPT_DIR/../version.txt" ) - export BUILD_VERSION -} - -# Set some useful variables for OS X, if applicable -setup_macos() { - if [[ "$(uname)" == Darwin ]]; then - export MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ - fi -} - - -# Top-level entry point for things every package will need to do -# -# Usage: setup_env 0.2.0 -setup_env() { - setup_cuda - setup_build_version "$1" - setup_macos -} - -# Function to retry functions that sometimes timeout or have flaky failures -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) -} - -# Inputs: -# PYTHON_VERSION (3.7, 3.8, 3.9) -# UNICODE_ABI (bool) -# -# Outputs: -# PATH modified to put correct Python version in PATH -# -# Precondition: If 
Linux, you are in a soumith/manylinux-cuda* Docker image -setup_wheel_python() { - if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - eval "$(conda shell.bash hook)" - conda env remove -n "env$PYTHON_VERSION" || true - conda create ${CONDA_CHANNEL_FLAGS} -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION" - conda activate "env$PYTHON_VERSION" - # Install libpng from Anaconda (defaults) - conda install ${CONDA_CHANNEL_FLAGS} libpng "jpeg<=9b" -y - else - # Install native CentOS libJPEG, freetype and GnuTLS - yum install -y libjpeg-turbo-devel freetype gnutls - case "$PYTHON_VERSION" in - 3.7) python_abi=cp37-cp37m ;; - 3.8) python_abi=cp38-cp38 ;; - 3.9) python_abi=cp39-cp39 ;; - 3.10) python_abi=cp310-cp310 ;; - *) - echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - ;; - esac - # Download all the dependencies required to compile image and video_reader - # extensions - - mkdir -p ext_libraries - pushd ext_libraries - popd - export PATH="/opt/python/$python_abi/bin:$(pwd)/ext_libraries/bin:$PATH" - fi -} - -# Install with pip a bit more robustly than the default -pip_install() { - retry pip install --progress-bar off "$@" -} - -# Install torch with pip, respecting PYTORCH_VERSION, and record the installed -# version into PYTORCH_VERSION, if applicable -setup_pip_pytorch_version() { - if [[ -z "$PYTORCH_VERSION" ]]; then - # Install latest prerelease version of torch, per our nightlies, consistent - # with the requested cuda version - pip_install --pre torch -f "https://download.pytorch.org/whl/nightly/${WHEEL_DIR}torch_nightly.html" - if [[ "$CUDA_VERSION" == "cpu" ]]; then - # CUDA and CPU are ABI compatible on the CPU-only parts, so strip - # in this case - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//' | sed 's/+.\+//')" - else - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//')" - fi - else - pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \ - -f "https://download.pytorch.org/whl/${CU_VERSION}/torch_stable.html" \ - -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${CU_VERSION}/torch_${UPLOAD_CHANNEL}.html" - fi -} - -# Fill PYTORCH_VERSION with the latest conda nightly version, and -# CONDA_CHANNEL_FLAGS with appropriate flags to retrieve these versions -# -# You MUST have populated PYTORCH_VERSION_SUFFIX before hand. -setup_conda_pytorch_constraint() { - if [[ -z "$PYTORCH_VERSION" ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch-nightly -c pytorch" - PYTHON="python" - # Check if we have python 3 instead and prefer that - if python3 --version >/dev/null 2>/dev/null; then - PYTHON="python3" - fi - export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \ - ${PYTHON} -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \ - cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ - cuver_2 = (cuver[:-1] + '.' 
+ cuver[-1]).replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ - print(re.sub(r'\\+.*$', '', \ - [x['version'] for x in json.load(sys.stdin)['pytorch'] \ - if (x['platform'] == 'darwin' or cuver_1 in x['fn'] or cuver_2 in x['fn']) \ - and 'py' + os.environ['PYTHON_VERSION'] in x['fn']][-1]))")" - if [[ -z "$PYTORCH_VERSION" ]]; then - echo "PyTorch version auto detection failed" - echo "No package found for CU_VERSION=$CU_VERSION and PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - fi - else - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch -c pytorch-${UPLOAD_CHANNEL}" - fi - if [[ "$CU_VERSION" == cpu ]]; then - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION" - else - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - fi - if [[ "$OSTYPE" == msys && "$CU_VERSION" == cu92 ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c defaults -c numba/label/dev" - fi -} - -# Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT -setup_conda_cudatoolkit_constraint() { - export CONDA_BUILD_VARIANT="cuda" - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - else - case "$CU_VERSION" in - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.7 # [not osx]" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.6 # [not osx]" - ;; - cu113) - export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.3,<11.4 # [not osx]" - ;; - cu102) - export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.2,<10.3 # [not osx]" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -setup_conda_cudatoolkit_plain_constraint() { - export CONDA_BUILD_VARIANT="cuda" - export CMAKE_USE_CUDA=1 - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - export CMAKE_USE_CUDA=0 - else - case "$CU_VERSION" in - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.7" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.6" - ;; - cu113) - export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=11.3" - ;; - cu102) - export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=10.2" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - export CMAKE_USE_CUDA=0 - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -# Build the proper compiler package before building the final package -setup_visual_studio_constraint() { - if [[ "$OSTYPE" == "msys" ]]; then - export VSTOOLCHAIN_PACKAGE=vs$VC_YEAR - conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE - cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torchvision/conda_build_config.yaml - fi -} - -setup_junit_results_folder() { - if [[ "$CI" == "true" ]]; then - export CONDA_PYTORCH_BUILD_RESULTS_DIRECTORY="${SOURCE_ROOT_DIR}/build_results/results.xml" - fi -} - - -download_copy_ffmpeg() { - if [[ "$OSTYPE" == "msys" ]]; then - # conda install -yq ffmpeg=4.2 -c pytorch - # curl -L -q https://anaconda.org/pytorch/ffmpeg/4.3/download/win-64/ffmpeg-4.3-ha925a31_0.tar.bz2 --output ffmpeg-4.3-ha925a31_0.tar.bz2 - # bzip2 --decompress --stdout ffmpeg-4.3-ha925a31_0.tar.bz2 | tar -x --file=- - # cp Library/bin/*.dll 
../torchvision - echo "FFmpeg is disabled currently on Windows" - else - if [[ "$(uname)" == Darwin ]]; then - conda install -yq ffmpeg=4.2 -c pytorch - conda install -yq wget - else - # pushd ext_libraries - # wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/linux-64/ffmpeg-4.2-hf484d3e_0.tar.bz2 - # tar -xjvf ffmpeg-4.2-hf484d3e_0.tar.bz2 - # rm -rf ffmpeg-4.2-hf484d3e_0.tar.bz2 - # ldconfig - # which ffmpeg - # popd - echo "FFmpeg is disabled currently on Linux" - fi - fi -} diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh new file mode 100644 index 00000000000..7aefa2649e6 --- /dev/null +++ b/packaging/post_build_script.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -euxo pipefail + +if [ -n "${CUDA_HOME:-}" ]; then + LD_LIBRARY_PATH="/usr/local/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" +fi + +python packaging/wheel/relocate.py + +if [[ "$(uname)" == "Linux" && "$(uname -m)" != "aarch64" ]]; then + extra_decoders_channel="--pre --index-url https://download.pytorch.org/whl/nightly/cpu" +else + extra_decoders_channel="" +fi + +pip install torchvision-extra-decoders $extra_decoders_channel diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh new file mode 100644 index 00000000000..fcacf4bf8a4 --- /dev/null +++ b/packaging/pre_build_script.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +if [[ "$(uname)" == Darwin ]]; then + # Uninstall Conflicting jpeg brew formulae + jpeg_packages=$(brew list | grep jpeg) + echo "Existing Jpeg-related Brew libraries" + echo $jpeg_packages + for pkg in $jpeg_packages; do + brew uninstall --ignore-dependencies --force $pkg || true + done + + conda install -y wget +fi + +if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then + conda install libpng libwebp -y + # Installing webp also installs a non-turbo jpeg, so we uninstall jpeg stuff + # before re-installing them + conda uninstall libjpeg-turbo libjpeg -y + conda install -y ffmpeg=4.2 -c pytorch + conda install -y libjpeg-turbo -c pytorch + + # Copy binaries to be included in the wheel distribution + if [[ "$OSTYPE" == "msys" ]]; then + python_exec="$(which python)" + bin_path=$(dirname $python_exec) + cp "$bin_path/Library/bin/libjpeg.dll" torchvision + fi +else + + if [[ "$ARCH" == "aarch64" ]]; then + conda install libpng -y + conda install -y ffmpeg=4.2 libjpeg-turbo -c pytorch-nightly + fi + + conda install libwebp -y + conda install libjpeg-turbo -c pytorch + yum install -y freetype gnutls + pip install "auditwheel<6.3.0" +fi + +pip install numpy pyyaml future ninja +pip install --upgrade setuptools==72.1.0 diff --git a/packaging/pre_build_script_arm64.sh b/packaging/pre_build_script_arm64.sh new file mode 100644 index 00000000000..b157139c26a --- /dev/null +++ b/packaging/pre_build_script_arm64.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +echo "Building vision dependencies and wheel started." 
+ +# Set environment variables +export SRC_PATH="$GITHUB_WORKSPACE/$SRC_DIR" +export CMAKE_BUILD_TYPE="$BUILD_TYPE" +export VCVARSALL_PATH="$DEPENDENCIES_DIR/VSBuildTools/VC/Auxiliary/Build/vcvarsall.bat" +export TRIPLET_FILE="triplets/arm64-windows.cmake" +export PYTORCH_VERSION="$PYTORCH_VERSION" +export CHANNEL="$CHANNEL" + +# Dependencies +mkdir -p "$DOWNLOADS_DIR" +mkdir -p "$DEPENDENCIES_DIR" +echo "*" > "$DOWNLOADS_DIR/.gitignore" +echo "*" > "$DEPENDENCIES_DIR/.gitignore" + +# Install vcpkg +cd "$DOWNLOADS_DIR" || exit +git clone https://github.com/microsoft/vcpkg.git +cd vcpkg || exit +./bootstrap-vcpkg.sh + +# Set vcpkg to only build release packages +echo "set(VCPKG_BUILD_TYPE release)" >> "$TRIPLET_FILE" + +# Install dependencies using vcpkg +./vcpkg install libjpeg-turbo:arm64-windows --x-install-root="$DEPENDENCIES_DIR" +./vcpkg install libwebp:arm64-windows --x-install-root="$DEPENDENCIES_DIR" +./vcpkg install libpng[tools]:arm64-windows --x-install-root="$DEPENDENCIES_DIR" + +# Copy files using cp +cp "$DEPENDENCIES_DIR/arm64-windows/lib/libpng16.lib" "$DEPENDENCIES_DIR/arm64-windows/lib/libpng.lib" +cp "$DEPENDENCIES_DIR/arm64-windows/bin/libpng16.dll" "$DEPENDENCIES_DIR/arm64-windows/bin/libpng.dll" +cp "$DEPENDENCIES_DIR/arm64-windows/bin/libpng16.pdb" "$DEPENDENCIES_DIR/arm64-windows/bin/libpng.pdb" +mkdir -p "$DEPENDENCIES_DIR/Library/" +cp -r "$DEPENDENCIES_DIR/arm64-windows/"* "$DEPENDENCIES_DIR/Library/" +cp -r "$DEPENDENCIES_DIR/Library/tools/libpng/"* "$DEPENDENCIES_DIR/Library/bin/" +cp -r "$DEPENDENCIES_DIR/Library/bin/"* "$SRC_PATH/torchvision" + +# Source directory +cd "$SRC_PATH" || exit + +# Create virtual environment +python -m pip install --upgrade pip +python -m venv .venv +echo "*" > .venv/.gitignore +source .venv/Scripts/activate + +# Install dependencies +pip install numpy==2.2.3 + +if [ "$CHANNEL" = "release" ]; then + echo "Installing latest stable version of PyTorch." + # TODO: update when arm64 torch available on pypi + pip3 install --pre torch --index-url https://download.pytorch.org/whl/torch/ +elif [ "$CHANNEL" = "test" ]; then + echo "Installing PyTorch version $PYTORCH_VERSION." + pip3 install --pre torch=="$PYTORCH_VERSION" --index-url https://download.pytorch.org/whl/test +else + echo "CHANNEL is not set, installing PyTorch from nightly." + pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu +fi + +echo "Dependencies install finished successfully." diff --git a/packaging/torchvision/conda_build_config.yaml b/packaging/torchvision/conda_build_config.yaml deleted file mode 100644 index 52b95952ddf..00000000000 --- a/packaging/torchvision/conda_build_config.yaml +++ /dev/null @@ -1,25 +0,0 @@ -channel_sources: - - pytorch-nightly,pytorch,defaults -blas_impl: - - mkl # [x86_64] -c_compiler: - - vs2017 # [win] -cxx_compiler: - - vs2017 # [win] -python: - - 3.7 -# This differs from target_platform in that it determines what subdir the compiler -# will target, not what subdir the compiler package will be itself. -# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 -# code on win-64 miniconda. 
-cross_compiler_target_platform: - - win-64 # [win] -target_platform: - - win-64 # [win] -vc: - - 14 -zip_keys: - - # [win] - - vc # [win] - - c_compiler # [win] - - cxx_compiler # [win] diff --git a/packaging/torchvision/meta.yaml b/packaging/torchvision/meta.yaml deleted file mode 100644 index 105e28c453e..00000000000 --- a/packaging/torchvision/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ -{% set build_variant = environ.get('CONDA_BUILD_VARIANT', 'cpu') %} -package: - name: torchvision - version: "{{ environ.get('BUILD_VERSION') }}" - -source: - path: "{{ environ.get('SOURCE_ROOT_DIR') }}" - -requirements: - build: - - {{ compiler('c') }} # [win] - - libpng - - jpeg - # NOTE: The only ffmpeg version that we build is actually 4.2 - - ffmpeg >=4.2 # [not win] - - host: - - python - - setuptools - - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] - {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT') }} - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }} - - run: - - python - - defaults::numpy >=1.11 - - requests - - libpng - - ffmpeg >=4.2 # [not win] - - jpeg - - pillow >=5.3.0, !=8.3.* - - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }} - - {% if build_variant == 'cpu' %} - run_constrained: - - cpuonly - {% elif not osx %} - run_constrained: - - cpuonly <0 - {% endif %} - -build: - string: py{{py}}_{{ environ['CU_VERSION'] }} - script: python setup.py install --single-version-externally-managed --record=record.txt - script_env: - - CUDA_HOME - - FORCE_CUDA - - BUILD_VERSION - - TORCH_CUDA_ARCH_LIST - - MACOSX_DEPLOYMENT_TARGET - -test: - imports: - - torchvision - - torchvision.datasets - - torchvision.transforms - source_files: - - test - requires: - - pytest - - scipy - - jpeg - - ca-certificates - - -about: - home: https://github.com/pytorch/vision - license: BSD - license_file: LICENSE - summary: 'image and video datasets and models for torch deep learning' diff --git a/packaging/vs2017/activate.bat b/packaging/vs2017/activate.bat deleted file mode 100644 index ccecfc25442..00000000000 --- a/packaging/vs2017/activate.bat +++ /dev/null @@ -1,44 +0,0 @@ -:: Set env vars that tell distutils to use the compiler that we put on path -SET DISTUTILS_USE_SDK=1 -SET MSSdk=1 - -SET "VS_VERSION=15.0" -SET "VS_MAJOR=15" -SET "VS_YEAR=2017" - -set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" -set "MSYS2_ENV_CONV_EXCL=CL" - -:: For Python 3.5+, ensure that we link with the dynamic runtime. See -:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info -set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VSINSTALLDIR=%%i\" - goto :vswhere - ) -) - -:vswhere - -:: Shorten PATH to avoid the `input line too long` error. -SET MyPath=%PATH% - -setlocal EnableDelayedExpansion - -SET TempPath="%MyPath:;=";"%" -SET var= -FOR %%a IN (%TempPath%) DO ( - IF EXIST %%~sa ( - SET "var=!var!;%%~sa" - ) -) - -set "TempPath=!var:~1!" -endlocal & set "PATH=%TempPath%" - -:: Shorten current directory too -FOR %%A IN (.) 
DO CD "%%~sA" - -:: other things added by install_activate.bat at package build time diff --git a/packaging/vs2017/conda_build_config.yaml b/packaging/vs2017/conda_build_config.yaml deleted file mode 100644 index 2479ceb3e76..00000000000 --- a/packaging/vs2017/conda_build_config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -blas_impl: - - mkl # [x86_64] -c_compiler: - - vs2017 # [win] -cxx_compiler: - - vs2017 # [win] -python: - - 3.7 -# This differs from target_platform in that it determines what subdir the compiler -# will target, not what subdir the compiler package will be itself. -# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 -# code on win-64 miniconda. -cross_compiler_target_platform: - - win-64 # [win] -target_platform: - - win-64 # [win] -vc: - - 14 -zip_keys: - - # [win] - - vc # [win] - - c_compiler # [win] - - cxx_compiler # [win] diff --git a/packaging/vs2017/install_activate.bat b/packaging/vs2017/install_activate.bat deleted file mode 100644 index 253d2f2c2c1..00000000000 --- a/packaging/vs2017/install_activate.bat +++ /dev/null @@ -1,29 +0,0 @@ -set YEAR=2017 -set VER=15 - -mkdir "%PREFIX%\etc\conda\activate.d" -COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - -IF "%cross_compiler_target_platform%" == "win-64" ( - set "target_platform=amd64" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - IF "%VSDEVCMD_ARGS%" == "" ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) ELSE ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) else ( - set "target_platform=x86" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd - ) diff --git a/packaging/vs2017/install_runtime.bat b/packaging/vs2017/install_runtime.bat deleted file mode 100644 index 5163c16cf24..00000000000 --- a/packaging/vs2017/install_runtime.bat +++ /dev/null @@ -1,49 +0,0 @@ -set VC_PATH=x86 -if "%ARCH%"=="64" ( - set VC_PATH=x64 -) - -set MSC_VER=2017 - -rem :: This should always be present for VC installed with VS. 
Not sure about VC installed with Visual C++ Build Tools 2015 -rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( -rem set SP=%%A -rem ) - -rem if not "%SP%" == "%PKG_VERSION%" ( -rem echo "Version detected from registry: %SP%" -rem echo "does not match version of package being built (%PKG_VERSION%)" -rem echo "Do you have current updates for VS 2015 installed?" -rem exit 1 -rem ) - - -REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E -if %ERRORLEVEL% GEQ 8 exit 1 - -REM ========== This one comes from visual studio 2017 -set "VC_VER=141" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto :eof - ) -) - -@setlocal -call "%VS15VARSALL%" x64 - -set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" - -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -@endlocal diff --git a/packaging/vs2017/meta.yaml b/packaging/vs2017/meta.yaml deleted file mode 100644 index 1f569525ee1..00000000000 --- a/packaging/vs2017/meta.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{% set vcver="14.1" %} -{% set vcfeature="14" %} -{% set vsyear="2017" %} -{% set fullver="15.4.27004.2010" %} - -package: - name: vs{{ vsyear }} - version: {{ fullver }} - -build: - skip: True [not win] - script_env: - - VSDEVCMD_ARGS # [win] - -outputs: - - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} - script: install_activate.bat - track_features: - # VS 2017 is binary-compatible with VS 2015/vc14. Tools are "v141". - strong: - - vc{{ vcfeature }} - about: - summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler - license: BSD 3-clause diff --git a/packaging/vs2019/activate.bat b/packaging/vs2019/activate.bat deleted file mode 100644 index 6f607ba7518..00000000000 --- a/packaging/vs2019/activate.bat +++ /dev/null @@ -1,44 +0,0 @@ -:: Set env vars that tell distutils to use the compiler that we put on path -SET DISTUTILS_USE_SDK=1 -SET MSSdk=1 - -SET "VS_VERSION=16.0" -SET "VS_MAJOR=16" -SET "VS_YEAR=2019" - -set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" -set "MSYS2_ENV_CONV_EXCL=CL" - -:: For Python 3.5+, ensure that we link with the dynamic runtime. See -:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info -set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VSINSTALLDIR=%%i\" - goto :vswhere - ) -) - -:vswhere - -:: Shorten PATH to avoid the `input line too long` error. 
-SET MyPath=%PATH% - -setlocal EnableDelayedExpansion - -SET TempPath="%MyPath:;=";"%" -SET var= -FOR %%a IN (%TempPath%) DO ( - IF EXIST %%~sa ( - SET "var=!var!;%%~sa" - ) -) - -set "TempPath=!var:~1!" -endlocal & set "PATH=%TempPath%" - -:: Shorten current directory too -FOR %%A IN (.) DO CD "%%~sA" - -:: other things added by install_activate.bat at package build time diff --git a/packaging/vs2019/conda_build_config.yaml b/packaging/vs2019/conda_build_config.yaml deleted file mode 100644 index 7bd8de2ea5b..00000000000 --- a/packaging/vs2019/conda_build_config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -blas_impl: - - mkl # [x86_64] -c_compiler: - - vs2019 # [win] -cxx_compiler: - - vs2019 # [win] -python: - - 3.7 -# This differs from target_platform in that it determines what subdir the compiler -# will target, not what subdir the compiler package will be itself. -# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 -# code on win-64 miniconda. -cross_compiler_target_platform: - - win-64 # [win] -target_platform: - - win-64 # [win] -vc: - - 14 -zip_keys: - - # [win] - - vc # [win] - - c_compiler # [win] - - cxx_compiler # [win] diff --git a/packaging/vs2019/install_activate.bat b/packaging/vs2019/install_activate.bat deleted file mode 100644 index 9e60ccfd2dc..00000000000 --- a/packaging/vs2019/install_activate.bat +++ /dev/null @@ -1,29 +0,0 @@ -set YEAR=2019 -set VER=16 - -mkdir "%PREFIX%\etc\conda\activate.d" -COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - -IF "%cross_compiler_target_platform%" == "win-64" ( - set "target_platform=amd64" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - IF "%VSDEVCMD_ARGS%" == "" ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) ELSE ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) else ( - set "target_platform=x86" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd - ) diff --git a/packaging/vs2019/install_runtime.bat b/packaging/vs2019/install_runtime.bat deleted file mode 100644 index e09a5ccfb0f..00000000000 --- a/packaging/vs2019/install_runtime.bat +++ /dev/null @@ -1,49 +0,0 @@ -set VC_PATH=x86 -if "%ARCH%"=="64" ( - set VC_PATH=x64 -) - -set MSC_VER=2019 - -rem :: This should always be present for VC installed with VS. 
Not sure about VC installed with Visual C++ Build Tools 2015 -rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( -rem set SP=%%A -rem ) - -rem if not "%SP%" == "%PKG_VERSION%" ( -rem echo "Version detected from registry: %SP%" -rem echo "does not match version of package being built (%PKG_VERSION%)" -rem echo "Do you have current updates for VS 2015 installed?" -rem exit 1 -rem ) - - -REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E -if %ERRORLEVEL% GEQ 8 exit 1 - -REM ========== This one comes from visual studio 2019 -set "VC_VER=142" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto :eof - ) -) - -@setlocal -call "%VS15VARSALL%" x64 - -set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" - -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -@endlocal diff --git a/packaging/vs2019/meta.yaml b/packaging/vs2019/meta.yaml deleted file mode 100644 index 94a0ed4db3e..00000000000 --- a/packaging/vs2019/meta.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{% set vcver="14.2" %} -{% set vcfeature="14" %} -{% set vsyear="2019" %} -{% set fullver="15.4.27004.2010" %} - -package: - name: vs{{ vsyear }} - version: {{ fullver }} - -build: - skip: True [not win] - script_env: - - VSDEVCMD_ARGS # [win] - -outputs: - - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} - script: install_activate.bat - track_features: - # VS 2019 is binary-compatible with VS 2017/vc 14.1 and 2015/vc14. Tools are "v142". 
- strong: - - vc{{ vcfeature }} - about: - summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler - license: BSD 3-clause diff --git a/packaging/wheel/relocate.py b/packaging/wheel/relocate.py index e6a4ef9d458..4587f3798da 100644 --- a/packaging/wheel/relocate.py +++ b/packaging/wheel/relocate.py @@ -2,7 +2,6 @@ import glob import hashlib -import io # Standard library imports import os @@ -16,7 +15,10 @@ # Third party imports if sys.platform == "linux": - from auditwheel.lddtree import lddtree + try: + from auditwheel.lddtree import lddtree + except ImportError: + from auditwheel import lddtree ALLOWLIST = { @@ -65,21 +67,12 @@ PYTHON_VERSION = sys.version_info -def read_chunks(file, size=io.DEFAULT_BUFFER_SIZE): - """Yield pieces of data from a file-like object until EOF.""" - while True: - chunk = file.read(size) - if not chunk: - break - yield chunk - - def rehash(path, blocksize=1 << 20): """Return (hash, length) for path using hashlib.sha256()""" h = hashlib.sha256() length = 0 with open(path, "rb") as f: - for block in read_chunks(f, size=blocksize): + while block := f.read(blocksize): length += len(block) h.update(block) digest = "sha256=" + urlsafe_b64encode(h.digest()).decode("latin1").rstrip("=") @@ -191,7 +184,7 @@ def relocate_elf_library(patchelf, output_dir, output_library, binary): print("Copying dependencies to wheel directory") new_libraries_path = osp.join(output_dir, "torchvision.libs") - os.makedirs(new_libraries_path) + os.makedirs(new_libraries_path, exist_ok=True) new_names = {binary: binary_path} diff --git a/packaging/windows/internal/build_cpp_example.bat b/packaging/windows/internal/build_cpp_example.bat index e3f7afe9f02..129c574e391 100644 --- a/packaging/windows/internal/build_cpp_example.bat +++ b/packaging/windows/internal/build_cpp_example.bat @@ -1,3 +1,3 @@ @echo on set CL=/I"C:\Program Files (x86)\torchvision\include" -msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" hello-world.vcxproj -maxcpucount:%1 +msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" run_model.vcxproj -maxcpucount:%1 diff --git a/packaging/windows/internal/build_frcnn.bat b/packaging/windows/internal/build_frcnn.bat deleted file mode 100644 index 36e3757d01c..00000000000 --- a/packaging/windows/internal/build_frcnn.bat +++ /dev/null @@ -1,3 +0,0 @@ -@echo on -set CL=/I"C:\Program Files (x86)\torchvision\include" -msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" test_frcnn_tracing.vcxproj -maxcpucount:%1 diff --git a/packaging/windows/internal/cuda_install.bat b/packaging/windows/internal/cuda_install.bat deleted file mode 100644 index de5bbd04515..00000000000 --- a/packaging/windows/internal/cuda_install.bat +++ /dev/null @@ -1,171 +0,0 @@ -@echo on - -if "%CU_VERSION%" == "cpu" ( - echo Skipping for CPU builds - exit /b 0 -) - -set SRC_DIR=%~dp0\.. 
- -if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" - -rem in unit test workflow, we get CUDA_VERSION, for example 11.1 -if defined CUDA_VERSION ( - set CUDA_VER=%CUDA_VERSION:.=% -) else ( - set CUDA_VER=%CU_VERSION:cu=% -) - -set /a CUDA_VER=%CU_VERSION:cu=% -set CUDA_VER_MAJOR=%CUDA_VER:~0,-1% -set CUDA_VER_MINOR=%CUDA_VER:~-1,1% -set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% -set CUDNN_FOLDER="cuda" -set CUDNN_LIB_FOLDER="lib\x64" - -if %CUDA_VER% EQU 102 goto cuda102 -if %CUDA_VER% EQU 113 goto cuda113 -if %CUDA_VER% EQU 116 goto cuda116 - -echo CUDA %CUDA_VERSION_STR% is not supported -exit /b 1 - -:cuda102 - -if not exist "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" ( - curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.2.89_441.22_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" - set "ARGS=nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvjpeg_10.2 nvjpeg_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2" -) - -if not exist "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" ( - curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-10.2-windows10-x64-v7.6.5.32.zip --output "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" -) - -rem The below only for cu102, if it's used in other version, e.g. cu111, torch.cuda.is_availabe() would be False. 
-if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.7z" ( - curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" - if errorlevel 1 exit /b 1 -) - -echo Installing GPU driver DLLs -7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -aoa -o"C:\Windows\System32" - -goto cuda_common - -:cuda113 - -set CUDA_INSTALL_EXE=cuda_11.3.0_465.89_win10.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=thrust_11.3 nvcc_11.3 cuobjdump_11.3 nvprune_11.3 nvprof_11.3 cupti_11.3 cublas_11.3 cublas_dev_11.3 cudart_11.3 cufft_11.3 cufft_dev_11.3 curand_11.3 curand_dev_11.3 cusolver_11.3 cusolver_dev_11.3 cusparse_11.3 cusparse_dev_11.3 npp_11.3 npp_dev_11.3 nvjpeg_11.3 nvjpeg_dev_11.3 nvrtc_11.3 nvrtc_dev_11.3 nvml_dev_11.3" - -) - -set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip -set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive -set CUDNN_LIB_FOLDER="lib" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - - rem Make sure windows path contains zlib dll - curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" - 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" - xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" -) - -goto cuda_common - -:cuda116 - -set CUDA_INSTALL_EXE=cuda_11.6.0_511.23_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 nvml_dev_11.6" -) - -set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip -set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive -set CUDNN_LIB_FOLDER="lib" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - - rem Make sure windows path contains zlib dll - curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" - 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" - xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" -) - -goto cuda_common - -:cuda_common - -if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( - curl -k -L https://www.dropbox.com/s/9mcolalfdj4n979/NvToolsExt.7z?dl=1 --output "%SRC_DIR%\temp_build\NvToolsExt.7z" - if errorlevel 1 exit /b 1 -) - -echo Installing CUDA toolkit... 
-7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda" -pushd "%SRC_DIR%\temp_build\cuda" -sc config wuauserv start= disabled -sc stop wuauserv -sc query wuauserv - -start /wait setup.exe -s %ARGS% -loglevel:6 -log:"%cd%/cuda_install_logs" -echo %errorlevel% - -popd - -echo Installing VS integration... -rem It's for VS 2019 -if "%CUDA_VER_MAJOR%" == "10" ( - xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations" -) -if "%CUDA_VER_MAJOR%" == "11" ( - xcopy /Y "%SRC_DIR%\temp_build\cuda\visual_studio_integration\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations" -) - -echo Installing NvToolsExt... -7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" - -echo Setting up environment... -set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" -set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" - -if not exist "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( - echo CUDA %CUDA_VERSION_STR% installed failed. - echo --------- RunDll32.exe.log - type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.RunDll32.exe.log" - echo --------- setup.exe.log ------- - type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.setup.exe.log" - exit /b 1 -) - -echo Installing cuDNN... 
-7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include" - -echo Cleaning temp files -rd /s /q "%SRC_DIR%\temp_build" || ver > nul diff --git a/packaging/windows/internal/driver_update.bat b/packaging/windows/internal/driver_update.bat deleted file mode 100644 index 00b43affc01..00000000000 --- a/packaging/windows/internal/driver_update.bat +++ /dev/null @@ -1,25 +0,0 @@ -set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe" -curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -if errorlevel 1 exit /b 1 - -start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot -if errorlevel 1 exit /b 1 - -del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL - -setlocal EnableDelayedExpansion -set NVIDIA_GPU_EXISTS=0 -for /F "delims=" %%i in ('wmic path win32_VideoController get name') do ( - set GPUS=%%i - if not "x!GPUS:NVIDIA=!" == "x!GPUS!" ( - SET NVIDIA_GPU_EXISTS=1 - goto gpu_check_end - ) -) -:gpu_check_end -endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS% - -if "%NVIDIA_GPU_EXISTS%" == "0" ( - echo "CUDA Driver installation Failed" - exit /b 1 -) diff --git a/packaging/windows/internal/vc_env_helper.bat b/packaging/windows/internal/vc_env_helper.bat index e85a372f93d..1f50b1b05b5 100644 --- a/packaging/windows/internal/vc_env_helper.bat +++ b/packaging/windows/internal/vc_env_helper.bat @@ -1,11 +1,7 @@ @echo on -set VC_VERSION_LOWER=16 -set VC_VERSION_UPPER=17 -if "%VC_YEAR%" == "2017" ( - set VC_VERSION_LOWER=15 - set VC_VERSION_UPPER=16 -) +set VC_VERSION_LOWER=17 +set VC_VERSION_UPPER=18 for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( @@ -24,6 +20,8 @@ if "%VSDEVCMD_ARGS%" == "" ( @echo on +if "%CU_VERSION%" == "xpu" call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" + set DISTUTILS_USE_SDK=1 set args=%1 diff --git a/packaging/windows/internal/vc_install_helper.sh b/packaging/windows/internal/vc_install_helper.sh deleted file mode 100644 index cdae18065b9..00000000000 --- a/packaging/windows/internal/vc_install_helper.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -ex - -if [[ "$CU_VERSION" == "cu92" ]]; then - export VC_YEAR=2017 - export VSDEVCMD_ARGS="-vcvars_ver=14.13" - powershell packaging/windows/internal/vs2017_install.ps1 -elif [[ "$CU_VERSION" == "cu100" ]]; then - export VC_YEAR=2017 - export VSDEVCMD_ARGS="" - powershell packaging/windows/internal/vs2017_install.ps1 -else - export VC_YEAR=2019 - export VSDEVCMD_ARGS="" -fi diff --git a/packaging/windows/internal/vs2017_install.ps1 b/packaging/windows/internal/vs2017_install.ps1 deleted file mode 100644 index 3e953de1ab7..00000000000 --- a/packaging/windows/internal/vs2017_install.ps1 +++ /dev/null @@ -1,25 +0,0 @@ -$VS_DOWNLOAD_LINK = 
"https://aka.ms/vs/15/release/vs_buildtools.exe" -$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", - "--add Microsoft.VisualStudio.Component.VC.Tools.14.13", - "--add Microsoft.Component.MSBuild", - "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", - "--add Microsoft.VisualStudio.Component.TextTemplating", - "--add Microsoft.VisualStudio.Component.VC.CoreIde", - "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", - "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64", - "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81") - -curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe -if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2017 installer failed" - exit 1 -} - -$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru -Remove-Item -Path vs_installer.exe -Force -$exitCode = $process.ExitCode -if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2017 installer exited with code $exitCode, which should be one of [0, 3010]." - exit 1 -} diff --git a/packaging/windows/internal/vs2019_install.ps1 b/packaging/windows/internal/vs2019_install.ps1 deleted file mode 100644 index e436051f0db..00000000000 --- a/packaging/windows/internal/vs2019_install.ps1 +++ /dev/null @@ -1,21 +0,0 @@ -$VS_DOWNLOAD_LINK = "https://aka.ms/vs/16/release/vs_buildtools.exe" -$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", - "--add Microsoft.Component.MSBuild", - "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", - "--add Microsoft.VisualStudio.Component.VC.CoreBuildTools", - "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64") - -curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe -if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2019 installer failed" - exit 1 -} - -$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru -Remove-Item -Path vs_installer.exe -Force -$exitCode = $process.ExitCode -if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2019 installer exited with code $exitCode, which should be one of [0, 3010]." - exit 1 -} diff --git a/pyproject.toml b/pyproject.toml index 8f0be4245bd..61e4a957fc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ first_party_detection = false [tool.black] line-length = 120 -target-version = ["py37"] +target-version = ["py38"] [tool.ufmt] diff --git a/pytest.ini b/pytest.ini index 1dde465d32f..8d52b55d5a6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,9 +1,9 @@ [pytest] addopts = - # show summary of all tests that did not pass - -ra + # show tests that (f)ailed, (E)rror, or (X)passed in the summary + -rfEX # Make tracebacks shorter - --tb=native + --tb=short # enable all warnings -Wd --ignore=test/test_datasets_download.py diff --git a/references/classification/README.md b/references/classification/README.md index e8d62134ca2..bc481f421ed 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -120,7 +120,7 @@ Here `$MODEL` is one of `efficientnet_v2_s` and `efficientnet_v2_m`. Note that the Small variant had a `$TRAIN_SIZE` of `300` and a `$EVAL_SIZE` of `384`, while the Medium `384` and `480` respectively. 
Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 4 nodes, each with 8 GPUs (for a total of 32 GPUs), +For generating the pre-trained weights, we trained with 4 nodes, each with 8 GPUs (for a total of 32 GPUs), and `--batch_size 32`. The weights of the Large variant are ported from the original paper rather than trained from scratch. See the `EfficientNet_V2_L_Weights` entry for their exact preprocessing transforms. @@ -135,7 +135,7 @@ torchrun --nproc_per_node=8 train.py\ --lr-scheduler=cosineannealinglr --lr-warmup-method=linear\ --lr-warmup-epochs=5 --lr-warmup-decay=0.1 ``` -Here `$MODEL` is one of `regnet_x_400mf`, `regnet_x_800mf`, `regnet_x_1_6gf`, `regnet_y_400mf`, `regnet_y_800mf` and `regnet_y_1_6gf`. Please note we used learning rate 0.4 for `regent_y_400mf` to get the same Acc@1 as [the paper)(https://arxiv.org/abs/2003.13678). +Here `$MODEL` is one of `regnet_x_400mf`, `regnet_x_800mf`, `regnet_x_1_6gf`, `regnet_y_400mf`, `regnet_y_800mf` and `regnet_y_1_6gf`. Please note we used learning rate 0.4 for `regent_y_400mf` to get the same Acc@1 as [the paper](https://arxiv.org/abs/2003.13678). #### Medium models ``` @@ -167,7 +167,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), +For generating the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), and `--batch_size 64`. #### vit_b_32 @@ -180,7 +180,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 256`. #### vit_l_16 @@ -193,7 +193,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 64`. #### vit_l_32 @@ -206,7 +206,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), +For generating the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), and `--batch_size 64`. @@ -221,7 +221,7 @@ torchrun --nproc_per_node=8 train.py\ Here `$MODEL` is one of `convnext_tiny`, `convnext_small`, `convnext_base` and `convnext_large`. Note that each variant had its `--val-resize-size` optimized in a post-training step, see their `Weights` entry for their exact value. Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 64`. @@ -245,6 +245,14 @@ Here `$MODEL` is one of `swin_v2_t`, `swin_v2_s` or `swin_v2_b`. 
Note that `--val-resize-size` was optimized in a post-training step, see their `Weights` entry for the exact value. +### MaxViT +``` +torchrun --nproc_per_node=8 --n_nodes=4 train.py\ +--model $MODEL --epochs 400 --batch-size 128 --opt adamw --lr 3e-3 --weight-decay 0.05 --lr-scheduler cosineannealinglr --lr-min 1e-5 --lr-warmup-method linear --lr-warmup-epochs 32 --label-smoothing 0.1 --mixup-alpha 0.8 --clip-grad-norm 1.0 --interpolation bicubic --auto-augment ta_wide --policy-magnitude 15 --model-ema --val-resize-size 224\ +--val-crop-size 224 --train-crop-size 224 --amp --model-ema-steps 32 --transformer-embedding-decay 0 --sync-bn +``` +Here `$MODEL` is `maxvit_t`. +Note that `--val-resize-size` was not optimized in a post-training step. ### ShuffleNet V2 @@ -281,24 +289,24 @@ For all post training quantized models, the settings are: 2. num_workers: 16 3. batch_size: 32 4. eval_batch_size: 128 -5. backend: 'fbgemm' +5. qbackend: 'fbgemm' ``` -python train_quantization.py --device='cpu' --post-training-quantize --backend='fbgemm' --model='$MODEL' +python train_quantization.py --device='cpu' --post-training-quantize --qbackend='fbgemm' --model='$MODEL' ``` Here `$MODEL` is one of `googlenet`, `inception_v3`, `resnet18`, `resnet50`, `resnext101_32x8d`, `shufflenet_v2_x0_5` and `shufflenet_v2_x1_0`. ### Quantized ShuffleNet V2 -Here are commands that we use to quantized the `shufflenet_v2_x1_5` and `shufflenet_v2_x2_0` models. +Here are commands that we use to quantize the `shufflenet_v2_x1_5` and `shufflenet_v2_x2_0` models. ``` # For shufflenet_v2_x1_5 -python train_quantization.py --device='cpu' --post-training-quantize --backend='fbgemm' \ +python train_quantization.py --device='cpu' --post-training-quantize --qbackend='fbgemm' \ --model=shufflenet_v2_x1_5 --weights="ShuffleNet_V2_X1_5_Weights.IMAGENET1K_V1" \ --train-crop-size 176 --val-resize-size 232 --data-path /datasets01_ontap/imagenet_full_size/061417/ # For shufflenet_v2_x2_0 -python train_quantization.py --device='cpu' --post-training-quantize --backend='fbgemm' \ +python train_quantization.py --device='cpu' --post-training-quantize --qbackend='fbgemm' \ --model=shufflenet_v2_x2_0 --weights="ShuffleNet_V2_X2_0_Weights.IMAGENET1K_V1" \ --train-crop-size 176 --val-resize-size 232 --data-path /datasets01_ontap/imagenet_full_size/061417/ ``` @@ -309,7 +317,7 @@ For Mobilenet-v2, the model was trained with quantization aware training, the se 1. num_workers: 16 2. batch_size: 32 3. eval_batch_size: 128 -4. backend: 'qnnpack' +4. qbackend: 'qnnpack' 5. learning-rate: 0.0001 6. num_epochs: 90 7. num_observer_update_epochs:4 @@ -331,7 +339,7 @@ For Mobilenet-v3 Large, the model was trained with quantization aware training, 1. num_workers: 16 2. batch_size: 32 3. eval_batch_size: 128 -4. backend: 'qnnpack' +4. qbackend: 'qnnpack' 5. learning-rate: 0.001 6. num_epochs: 90 7. num_observer_update_epochs:4 @@ -351,7 +359,7 @@ For post training quant, device is set to CPU. 
For training, the device is set t ### Command to evaluate quantized models using the pre-trained weights: ``` -python train_quantization.py --device='cpu' --test-only --backend='' --model='' +python train_quantization.py --device='cpu' --test-only --qbackend='' --model='' ``` For inception_v3 you need to pass the following extra parameters: diff --git a/references/classification/presets.py b/references/classification/presets.py index 6bc38e72953..8653957a576 100644 --- a/references/classification/presets.py +++ b/references/classification/presets.py @@ -1,9 +1,23 @@ import torch -from torchvision.transforms import autoaugment, transforms from torchvision.transforms.functional import InterpolationMode +def get_module(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + + return torchvision.transforms.v2 + else: + import torchvision.transforms + + return torchvision.transforms + + class ClassificationPresetTrain: + # Note: this transform assumes that the input to forward() are always PIL + # images, regardless of the backend parameter. We may change that in the + # future though, if we change the output type from the dataset. def __init__( self, *, @@ -13,32 +27,51 @@ def __init__( interpolation=InterpolationMode.BILINEAR, hflip_prob=0.5, auto_augment_policy=None, + ra_magnitude=9, + augmix_severity=3, random_erase_prob=0.0, + backend="pil", + use_v2=False, ): - trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + T = get_module(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tensor' or 'pil', but got {backend}") + + transforms.append(T.RandomResizedCrop(crop_size, interpolation=interpolation, antialias=True)) if hflip_prob > 0: - trans.append(transforms.RandomHorizontalFlip(hflip_prob)) + transforms.append(T.RandomHorizontalFlip(hflip_prob)) if auto_augment_policy is not None: if auto_augment_policy == "ra": - trans.append(autoaugment.RandAugment(interpolation=interpolation)) + transforms.append(T.RandAugment(interpolation=interpolation, magnitude=ra_magnitude)) elif auto_augment_policy == "ta_wide": - trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation)) + transforms.append(T.TrivialAugmentWide(interpolation=interpolation)) elif auto_augment_policy == "augmix": - trans.append(autoaugment.AugMix(interpolation=interpolation)) + transforms.append(T.AugMix(interpolation=interpolation, severity=augmix_severity)) else: - aa_policy = autoaugment.AutoAugmentPolicy(auto_augment_policy) - trans.append(autoaugment.AutoAugment(policy=aa_policy, interpolation=interpolation)) - trans.extend( + aa_policy = T.AutoAugmentPolicy(auto_augment_policy) + transforms.append(T.AutoAugment(policy=aa_policy, interpolation=interpolation)) + + if backend == "pil": + transforms.append(T.PILToTensor()) + + transforms.extend( [ - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), + T.ToDtype(torch.float, scale=True) if use_v2 else T.ConvertImageDtype(torch.float), + T.Normalize(mean=mean, std=std), ] ) if random_erase_prob > 0: - trans.append(transforms.RandomErasing(p=random_erase_prob)) + transforms.append(T.RandomErasing(p=random_erase_prob)) - self.transforms = transforms.Compose(trans) + if use_v2: + transforms.append(T.ToPureTensor()) + + self.transforms = T.Compose(transforms) def 
__call__(self, img): return self.transforms(img) @@ -53,17 +86,34 @@ def __init__( mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), interpolation=InterpolationMode.BILINEAR, + backend="pil", + use_v2=False, ): + T = get_module(use_v2) + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tensor' or 'pil', but got {backend}") - self.transforms = transforms.Compose( - [ - transforms.Resize(resize_size, interpolation=interpolation), - transforms.CenterCrop(crop_size), - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), - ] - ) + transforms += [ + T.Resize(resize_size, interpolation=interpolation, antialias=True), + T.CenterCrop(crop_size), + ] + + if backend == "pil": + transforms.append(T.PILToTensor()) + + transforms += [ + T.ToDtype(torch.float, scale=True) if use_v2 else T.ConvertImageDtype(torch.float), + T.Normalize(mean=mean, std=std), + ] + + if use_v2: + transforms.append(T.ToPureTensor()) + + self.transforms = T.Compose(transforms) def __call__(self, img): return self.transforms(img) diff --git a/references/classification/train.py b/references/classification/train.py index 14360b042ed..d52124fcf33 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -7,12 +7,13 @@ import torch import torch.utils.data import torchvision -import transforms +import torchvision.transforms import utils from sampler import RASampler from torch import nn from torch.utils.data.dataloader import default_collate from torchvision.transforms.functional import InterpolationMode +from transforms import get_mixup_cutmix def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None): @@ -113,7 +114,11 @@ def _get_cache_path(filepath): def load_data(traindir, valdir, args): # Data loading code print("Loading data") - val_resize_size, val_crop_size, train_crop_size = args.val_resize_size, args.val_crop_size, args.train_crop_size + val_resize_size, val_crop_size, train_crop_size = ( + args.val_resize_size, + args.val_crop_size, + args.train_crop_size, + ) interpolation = InterpolationMode(args.interpolation) print("Loading training data") @@ -122,10 +127,15 @@ def load_data(traindir, valdir, args): if args.cache_dataset and os.path.exists(cache_path): # Attention, as the transforms are also cached! print(f"Loading dataset_train from {cache_path}") - dataset, _ = torch.load(cache_path) + # TODO: this could probably be weights_only=True + dataset, _ = torch.load(cache_path, weights_only=False) else: + # We need a default value for the variables below because args may come + # from train_quantization.py which doesn't define them. 
auto_augment_policy = getattr(args, "auto_augment", None) random_erase_prob = getattr(args, "random_erase", 0.0) + ra_magnitude = getattr(args, "ra_magnitude", None) + augmix_severity = getattr(args, "augmix_severity", None) dataset = torchvision.datasets.ImageFolder( traindir, presets.ClassificationPresetTrain( @@ -133,6 +143,10 @@ def load_data(traindir, valdir, args): interpolation=interpolation, auto_augment_policy=auto_augment_policy, random_erase_prob=random_erase_prob, + ra_magnitude=ra_magnitude, + augmix_severity=augmix_severity, + backend=args.backend, + use_v2=args.use_v2, ), ) if args.cache_dataset: @@ -146,14 +160,22 @@ def load_data(traindir, valdir, args): if args.cache_dataset and os.path.exists(cache_path): # Attention, as the transforms are also cached! print(f"Loading dataset_test from {cache_path}") - dataset_test, _ = torch.load(cache_path) + # TODO: this could probably be weights_only=True + dataset_test, _ = torch.load(cache_path, weights_only=False) else: if args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) - preprocessing = weights.transforms() + preprocessing = weights.transforms(antialias=True) + if args.backend == "tensor": + preprocessing = torchvision.transforms.Compose([torchvision.transforms.PILToTensor(), preprocessing]) + else: preprocessing = presets.ClassificationPresetEval( - crop_size=val_crop_size, resize_size=val_resize_size, interpolation=interpolation + crop_size=val_crop_size, + resize_size=val_resize_size, + interpolation=interpolation, + backend=args.backend, + use_v2=args.use_v2, ) dataset_test = torchvision.datasets.ImageFolder( @@ -198,16 +220,18 @@ def main(args): val_dir = os.path.join(args.data_path, "val") dataset, dataset_test, train_sampler, test_sampler = load_data(train_dir, val_dir, args) - collate_fn = None num_classes = len(dataset.classes) - mixup_transforms = [] - if args.mixup_alpha > 0.0: - mixup_transforms.append(transforms.RandomMixup(num_classes, p=1.0, alpha=args.mixup_alpha)) - if args.cutmix_alpha > 0.0: - mixup_transforms.append(transforms.RandomCutmix(num_classes, p=1.0, alpha=args.cutmix_alpha)) - if mixup_transforms: - mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) - collate_fn = lambda batch: mixupcutmix(*default_collate(batch)) # noqa: E731 + mixup_cutmix = get_mixup_cutmix( + mixup_alpha=args.mixup_alpha, cutmix_alpha=args.cutmix_alpha, num_classes=num_classes, use_v2=args.use_v2 + ) + if mixup_cutmix is not None: + + def collate_fn(batch): + return mixup_cutmix(*default_collate(batch)) + + else: + collate_fn = default_collate + data_loader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, @@ -303,11 +327,11 @@ def main(args): model_ema = None if args.model_ema: - # Decay adjustment that aims to keep the decay independent from other hyper-parameters originally proposed at: + # Decay adjustment that aims to keep the decay independent of other hyper-parameters originally proposed at: # https://github.com/facebookresearch/pycls/blob/f8cd9627/pycls/core/net.py#L123 # # total_ema_updates = (Dataset_size / n_GPUs) * epochs / (batch_size_per_gpu * EMA_steps) - # We consider constant = Dataset_size for a given dataset/setup and ommit it. Thus: + # We consider constant = Dataset_size for a given dataset/setup and omit it. 
Thus: # adjust = 1 / total_ema_updates ~= n_GPUs * batch_size_per_gpu * EMA_steps / epochs adjust = args.world_size * args.batch_size * args.model_ema_steps / args.epochs alpha = 1.0 - args.model_ema_decay @@ -315,7 +339,7 @@ def main(args): model_ema = utils.ExponentialMovingAverage(model_without_ddp, device=device, decay=1.0 - alpha) if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) if not args.test_only: optimizer.load_state_dict(checkpoint["optimizer"]) @@ -448,6 +472,8 @@ def get_args_parser(add_help=True): action="store_true", ) parser.add_argument("--auto-augment", default=None, type=str, help="auto augment policy (default: None)") + parser.add_argument("--ra-magnitude", default=9, type=int, help="magnitude of auto augment policy") + parser.add_argument("--augmix-severity", default=3, type=int, help="severity of augmix policy") parser.add_argument("--random-erase", default=0.0, type=float, help="random erasing probability (default: 0.0)") # Mixed precision training parameters @@ -492,7 +518,8 @@ def get_args_parser(add_help=True): "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)" ) parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") - + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") return parser diff --git a/references/classification/train_quantization.py b/references/classification/train_quantization.py index ed36e13a028..bd324c6eef7 100644 --- a/references/classification/train_quantization.py +++ b/references/classification/train_quantization.py @@ -23,9 +23,9 @@ def main(args): raise RuntimeError("Post training quantization example should not be performed on distributed mode") # Set backend engine to ensure that quantized model runs on the correct kernels - if args.backend not in torch.backends.quantized.supported_engines: - raise RuntimeError("Quantized backend not supported: " + str(args.backend)) - torch.backends.quantized.engine = args.backend + if args.qbackend not in torch.backends.quantized.supported_engines: + raise RuntimeError("Quantized backend not supported: " + str(args.qbackend)) + torch.backends.quantized.engine = args.qbackend device = torch.device(args.device) torch.backends.cudnn.benchmark = True @@ -55,7 +55,7 @@ def main(args): if not (args.test_only or args.post_training_quantize): model.fuse_model(is_qat=True) - model.qconfig = torch.ao.quantization.get_default_qat_qconfig(args.backend) + model.qconfig = torch.ao.quantization.get_default_qat_qconfig(args.qbackend) torch.ao.quantization.prepare_qat(model, inplace=True) if args.distributed and args.sync_bn: @@ -74,7 +74,7 @@ def main(args): model_without_ddp = model.module if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) optimizer.load_state_dict(checkpoint["optimizer"]) lr_scheduler.load_state_dict(checkpoint["lr_scheduler"]) @@ -89,7 +89,7 @@ def main(args): ) model.eval() model.fuse_model(is_qat=False) - model.qconfig = torch.ao.quantization.get_default_qconfig(args.backend) + model.qconfig = torch.ao.quantization.get_default_qconfig(args.qbackend) 
torch.ao.quantization.prepare(model, inplace=True) # Calibrate first print("Calibrating") @@ -161,7 +161,7 @@ def get_args_parser(add_help=True): parser.add_argument("--data-path", default="/datasets01/imagenet_full_size/061417/", type=str, help="dataset path") parser.add_argument("--model", default="mobilenet_v2", type=str, help="model name") - parser.add_argument("--backend", default="qnnpack", type=str, help="fbgemm or qnnpack") + parser.add_argument("--qbackend", default="qnnpack", type=str, help="Quantized backend: fbgemm or qnnpack") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( @@ -257,9 +257,17 @@ def get_args_parser(add_help=True): parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)") parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") + return parser if __name__ == "__main__": args = get_args_parser().parse_args() + if args.backend in ("fbgemm", "qnnpack"): + raise ValueError( + "The --backend parameter has been re-purposed to specify the backend of the transforms (PIL or Tensor) " + "instead of the quantized backend. Please use the --qbackend parameter to specify the quantized backend." + ) main(args) diff --git a/references/classification/transforms.py b/references/classification/transforms.py index 9a8ef7877d6..96236608eec 100644 --- a/references/classification/transforms.py +++ b/references/classification/transforms.py @@ -2,12 +2,35 @@ from typing import Tuple import torch +from presets import get_module from torch import Tensor from torchvision.transforms import functional as F -class RandomMixup(torch.nn.Module): - """Randomly apply Mixup to the provided batch and targets. +def get_mixup_cutmix(*, mixup_alpha, cutmix_alpha, num_classes, use_v2): + transforms_module = get_module(use_v2) + + mixup_cutmix = [] + if mixup_alpha > 0: + mixup_cutmix.append( + transforms_module.MixUp(alpha=mixup_alpha, num_classes=num_classes) + if use_v2 + else RandomMixUp(num_classes=num_classes, p=1.0, alpha=mixup_alpha) + ) + if cutmix_alpha > 0: + mixup_cutmix.append( + transforms_module.CutMix(alpha=cutmix_alpha, num_classes=num_classes) + if use_v2 + else RandomCutMix(num_classes=num_classes, p=1.0, alpha=cutmix_alpha) + ) + if not mixup_cutmix: + return None + + return transforms_module.RandomChoice(mixup_cutmix) + + +class RandomMixUp(torch.nn.Module): + """Randomly apply MixUp to the provided batch and targets. The class implements the data augmentations as described in the paper `"mixup: Beyond Empirical Risk Minimization" `_. @@ -89,8 +112,8 @@ def __repr__(self) -> str: return s -class RandomCutmix(torch.nn.Module): - """Randomly apply Cutmix to the provided batch and targets. +class RandomCutMix(torch.nn.Module): + """Randomly apply CutMix to the provided batch and targets. The class implements the data augmentations as described in the paper `"CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features" `_. 
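The renamed `RandomMixUp`/`RandomCutMix` classes and the new `get_mixup_cutmix` factory above are consumed through a `DataLoader` collate function in `train.py`. Below is a minimal sketch of that wiring; it assumes `references/classification` is on the import path, and the hyper-parameter values are purely illustrative (the real values come from the `--mixup-alpha` and `--cutmix-alpha` flags).

```
# Sketch only: assumes references/classification is importable so that
# transforms.get_mixup_cutmix (added in this change) can be used directly.
from torch.utils.data.dataloader import default_collate
from transforms import get_mixup_cutmix

# Illustrative hyper-parameters; train.py reads them from the CLI flags.
mixup_cutmix = get_mixup_cutmix(mixup_alpha=0.2, cutmix_alpha=1.0, num_classes=1000, use_v2=True)

def collate_fn(batch):
    # Collate the samples into (images, targets) batch tensors first, then
    # apply MixUp or CutMix to the whole batch, mirroring train.py.
    collated = default_collate(batch)
    return mixup_cutmix(*collated) if mixup_cutmix is not None else collated

# The resulting collate_fn is then passed to torch.utils.data.DataLoader(..., collate_fn=collate_fn).
```

Applying MixUp/CutMix inside the collate function keeps the per-sample transform pipeline untouched and lets the same call site serve both the v1 (`RandomMixUp`/`RandomCutMix`) and v2 (`MixUp`/`CutMix`) implementations.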
diff --git a/references/classification/utils.py b/references/classification/utils.py index c31f3928e86..7d9f0136ae8 100644 --- a/references/classification/utils.py +++ b/references/classification/utils.py @@ -287,8 +287,7 @@ def average_checkpoints(inputs): for fpath in inputs: with open(fpath, "rb") as f: state = torch.load( - f, - map_location=(lambda s, _: torch.serialization.default_restore_location(s, "cpu")), + f, map_location=(lambda s, _: torch.serialization.default_restore_location(s, "cpu")), weights_only=True ) # Copies over the settings from the first checkpoint if new_state is None: @@ -365,12 +364,12 @@ def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=T checkpoint_path = os.path.abspath(checkpoint_path) output_dir = os.path.dirname(checkpoint_path) - # Deep copy to avoid side-effects on the model object. + # Deep copy to avoid side effects on the model object. model = copy.deepcopy(model) - checkpoint = torch.load(checkpoint_path, map_location="cpu") + checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True) # Load the weights to the model to validate that everything works - # and remove unnecessary weights (such as auxiliaries, etc) + # and remove unnecessary weights (such as auxiliaries, etc.) if checkpoint_key == "model_ema": del checkpoint[checkpoint_key]["n_averaged"] torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(checkpoint[checkpoint_key], "module.") diff --git a/references/depth/stereo/README.md b/references/depth/stereo/README.md new file mode 100644 index 00000000000..22bcae27ab0 --- /dev/null +++ b/references/depth/stereo/README.md @@ -0,0 +1,180 @@ +# Stereo Matching reference training scripts + +This folder contains reference training scripts for Stereo Matching. +They serve as a log of how to train specific models, so as to provide baseline +training and evaluation scripts to quickly bootstrap research. + + +### CREStereo + +The CREStereo model was trained on a dataset mixture between **CREStereo**, **ETH3D** and the additional split from **Middlebury2014**. +A ratio of **88-6-6** was used in order to train a baseline weight set. We provide a multi-set variant as well. +Both used 8 A100 GPUs and a batch size of 2 (so the effective batch size is 16). The +rest of the hyper-parameters loosely follow the recipe from https://github.com/megvii-research/CREStereo. +The original recipe trains for **300000** updates (or steps) on the dataset mixture. We modify the learning rate +schedule to one that starts decaying the learning rate much sooner. Throughout the experiments we found that this reduces +overfitting during evaluation and that gradient clipping helps stabilize the loss around a premature learning rate change. + +``` +torchrun --nproc_per_node 8 --nnodes 1 train.py \ + --dataset-root $dataset_root \ + --name $name_cre \ + --model crestereo_base \ + --train-datasets crestereo eth3d-train middlebury2014-other \ + --dataset-steps 264000 18000 18000 \ + --batch-size 2 \ + --lr 0.0004 \ + --min-lr 0.00002 \ + --lr-decay-method cosine \ + --warmup-steps 6000 \ + --decay-after-steps 30000 \ + --clip-grad-norm 1.0 \ +``` + +We employ a multi-set fine-tuning stage where we uniformly sample from multiple datasets. Given that some of these datasets have extremely large images (``2048x2048`` or more) we opt for a very aggressive scale-range ``[0.2 - 0.8]`` so that as much as possible of the original frame composition is captured inside the ``384x512`` crop.
+ +``` +torchrun --nproc_per_node 8 --nnodes 1 train.py \ + --dataset-root $dataset_root \ + --name $name_things \ + --model crestereo_base \ + --train-datasets crestereo eth3d-train middlebury2014-other instereo2k fallingthings carla-highres sintel sceneflow-monkaa sceneflow-driving \ + --dataset-steps 12000 12000 12000 12000 12000 12000 12000 12000 12000 + --batch-size 2 \ + --scale-range 0.2 0.8 \ + --lr 0.0004 \ + --lr-decay-method cosine \ + --decay-after-steps 0 \ + --warmup-steps 0 \ + --min-lr 0.00002 \ + --resume-path $checkpoint_dir/$name_cre.pth +``` + + +### Evaluation + +Evaluating the base weights + +``` +torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_ETH_MBL_V1 +``` + +This should give an **mae of about 1.416** on the train set of `Middlebury2014`. Results may vary slightly depending on the batch size and the number of GPUs. For the most accurate results use 1 GPU and `--batch-size 1`. The created log file should look like this, where the first key is the number of cascades and the nested key is the number of recursive iterations: + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 2.363, 'rmse': 4.352, '1px': 0.611, '3px': 0.828, '5px': 0.891, 'relepe': 0.176, 'fl-all': 64.511} + 5: {'mae': 1.618, 'rmse': 3.71, '1px': 0.761, '3px': 0.879, '5px': 0.918, 'relepe': 0.154, 'fl-all': 77.128} + 10: {'mae': 1.416, 'rmse': 3.53, '1px': 0.777, '3px': 0.896, '5px': 0.933, 'relepe': 0.148, 'fl-all': 78.388} + 20: {'mae': 1.448, 'rmse': 3.583, '1px': 0.771, '3px': 0.893, '5px': 0.931, 'relepe': 0.145, 'fl-all': 77.7} + }, +} +{ + 2: { + 2: {'mae': 1.972, 'rmse': 4.125, '1px': 0.73, '3px': 0.865, '5px': 0.908, 'relepe': 0.169, 'fl-all': 74.396} + 5: {'mae': 1.403, 'rmse': 3.448, '1px': 0.793, '3px': 0.905, '5px': 0.937, 'relepe': 0.151, 'fl-all': 80.186} + 10: {'mae': 1.312, 'rmse': 3.368, '1px': 0.799, '3px': 0.912, '5px': 0.943, 'relepe': 0.148, 'fl-all': 80.379} + 20: {'mae': 1.376, 'rmse': 3.542, '1px': 0.796, '3px': 0.91, '5px': 0.942, 'relepe': 0.149, 'fl-all': 80.054} + }, +} +``` + +You can also evaluate the Finetuned weights: + +``` +torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_FINETUNE_MULTI_V1 +``` + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 1.85, 'rmse': 3.797, '1px': 0.673, '3px': 0.862, '5px': 0.917, 'relepe': 0.171, 'fl-all': 69.736} + 5: {'mae': 1.111, 'rmse': 3.166, '1px': 0.838, '3px': 0.93, '5px': 0.957, 'relepe': 0.134, 'fl-all': 84.596} + 10: {'mae': 1.02, 'rmse': 3.073, '1px': 0.854, '3px': 0.938, '5px': 0.96, 'relepe': 0.129, 'fl-all': 86.042} + 20: {'mae': 0.993, 'rmse': 3.059, '1px': 0.855, '3px': 0.942, '5px': 0.967, 'relepe': 0.126, 'fl-all': 85.784} + }, +} +{ + 2: { + 2: {'mae': 1.667, 'rmse': 3.867, '1px': 0.78, '3px': 0.891, '5px': 0.922, 'relepe': 0.165, 'fl-all': 78.89} + 5: {'mae': 1.158, 'rmse': 3.278, '1px': 0.843, '3px': 0.926, '5px': 0.955, 'relepe': 0.135, 'fl-all': 84.556} + 10: {'mae': 1.046, 'rmse': 3.13, '1px': 0.85, '3px': 0.934, '5px': 0.96, 'relepe': 0.13, 'fl-all': 85.464} + 20: {'mae': 1.021, 'rmse': 3.102, '1px': 0.85, '3px': 0.935, '5px': 0.963, 'relepe': 0.129, 'fl-all': 85.417} + }, +} +``` + +Evaluating the author provided weights: + +``` +torchrun --nproc_per_node 
1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.MEGVII_V1 +``` + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 1.704, 'rmse': 3.738, '1px': 0.738, '3px': 0.896, '5px': 0.933, 'relepe': 0.157, 'fl-all': 76.464} + 5: {'mae': 0.956, 'rmse': 2.963, '1px': 0.88, '3px': 0.948, '5px': 0.965, 'relepe': 0.124, 'fl-all': 88.186} + 10: {'mae': 0.792, 'rmse': 2.765, '1px': 0.905, '3px': 0.958, '5px': 0.97, 'relepe': 0.114, 'fl-all': 90.429} + 20: {'mae': 0.749, 'rmse': 2.706, '1px': 0.907, '3px': 0.961, '5px': 0.972, 'relepe': 0.113, 'fl-all': 90.807} + }, +} +{ + 2: { + 2: {'mae': 1.702, 'rmse': 3.784, '1px': 0.784, '3px': 0.894, '5px': 0.924, 'relepe': 0.172, 'fl-all': 80.313} + 5: {'mae': 0.932, 'rmse': 2.907, '1px': 0.877, '3px': 0.944, '5px': 0.963, 'relepe': 0.125, 'fl-all': 87.979} + 10: {'mae': 0.773, 'rmse': 2.768, '1px': 0.901, '3px': 0.958, '5px': 0.972, 'relepe': 0.117, 'fl-all': 90.43} + 20: {'mae': 0.854, 'rmse': 2.971, '1px': 0.9, '3px': 0.957, '5px': 0.97, 'relepe': 0.122, 'fl-all': 90.269} + }, +} +``` + +# Concerns when training + +We encourage users to be aware of the **aspect-ratio** and **disparity scale** they are targeting when doing any sort of training or fine-tuning. The model is highly sensitive to these two factors; as a consequence of naive multi-set fine-tuning, one can achieve `0.2 mae` relatively fast. We recommend that users pay close attention to how they **balance dataset sizing** when training such networks. + + Ideally, dataset scaling should be treated at an individual level and a thorough **EDA** of the disparity distribution in random crops at the desired training / inference size should be performed prior to any large compute investments. + +### Disparity scaling + +##### Sample A + The top row contains a sample from `Sintel` whereas the bottom row contains one from `Middlebury`. + +![Disparity1](assets/disparity-domain-drift.jpg) + +From left to right (`left_image`, `right_image`, `valid_mask`, `valid_mask & ground_truth`, `prediction`). **Darker is further away, lighter is closer**. In the case of `Sintel`, which is more closely aligned to the original distribution of `CREStereo`, we notice that the model accurately predicts the background scale, whereas in the case of `Middlebury2014` it cannot correctly estimate the continuous disparity. Notice that the frame composition is similar for both examples. The blue skybox in the `Sintel` scene behaves similarly to the `Middlebury` black background. However, because the `Middlebury` sample comes from an extremely large scene, the crop size of `384x512` does not correctly capture the general training distribution. + + + + +##### Sample B + +The top row contains a scene from `Sceneflow` using the `Monkaa` split whilst the bottom row is a scene from `Middlebury`. This sample exhibits the same issues when it comes to **background estimation**. Given the exaggerated size of the `Middlebury` samples, the model **collapses the smooth background** of the sample to what it considers to be a mean background disparity value.
+ +![Disparity2](assets/disparity-background-mode-collapse.jpg) + + +For more detail on why this behaviour occurs based on the training distribution proportions, you can read more about the network at: https://github.com/pytorch/vision/pull/6629#discussion_r978160493 + + +### Metric overfitting + +##### Learning is critical in the beginning + +We also advise users to make use of faster training schedules, as the performance gain over long periods of time is marginal. Here we exhibit the difference between a faster decay schedule and a later decay schedule. + +![Loss1](assets/Loss.jpg) + +In **grey** we set the lr decay to begin after `30000` steps whilst in **orange** we opt for a very late learning rate decay at around `180000` steps. Although exhibiting stronger variance, we can notice that unfreezing the learning rate earlier whilst employing `gradient-norm` out-performs the default configuration. + +##### Gradient norm saves time + +![Loss2](assets/gradient-norm-removal.jpg) + +In **grey** we keep ``gradient norm`` enabled whilst in **orange** we do not. We can notice that removing the gradient norm exacerbates the performance decrease in the early stages whilst also showcasing an almost complete collapse around the `60000` steps mark, where we started decaying the lr for **orange**. + +Although both runs achieve an improvement of about ``0.1`` mae after the lr decay starts, the benefits of it are observable much faster when ``gradient norm`` is employed, as the recovery period is no longer accounted for. diff --git a/references/depth/stereo/__init__.py b/references/depth/stereo/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/references/depth/stereo/assets/Loss.jpg b/references/depth/stereo/assets/Loss.jpg new file mode 100644 index 00000000000..b6db8e204af Binary files /dev/null and b/references/depth/stereo/assets/Loss.jpg differ diff --git a/references/depth/stereo/assets/disparity-background-mode-collapse.jpg b/references/depth/stereo/assets/disparity-background-mode-collapse.jpg new file mode 100644 index 00000000000..b6542e8814f Binary files /dev/null and b/references/depth/stereo/assets/disparity-background-mode-collapse.jpg differ diff --git a/references/depth/stereo/assets/disparity-domain-drift.jpg b/references/depth/stereo/assets/disparity-domain-drift.jpg new file mode 100644 index 00000000000..8a98de03675 Binary files /dev/null and b/references/depth/stereo/assets/disparity-domain-drift.jpg differ diff --git a/references/depth/stereo/assets/gradient-norm-removal.jpg b/references/depth/stereo/assets/gradient-norm-removal.jpg new file mode 100644 index 00000000000..2c3c8459d5e Binary files /dev/null and b/references/depth/stereo/assets/gradient-norm-removal.jpg differ diff --git a/references/depth/stereo/cascade_evaluation.py b/references/depth/stereo/cascade_evaluation.py new file mode 100644 index 00000000000..7cb6413f1a5 --- /dev/null +++ b/references/depth/stereo/cascade_evaluation.py @@ -0,0 +1,299 @@ +import os +import warnings + +import torch +import torchvision +import torchvision.prototype.models.depth.stereo +import utils +from torch.nn import functional as F +from train import make_eval_loader + +from utils.metrics import AVAILABLE_METRICS +from visualization import make_prediction_image_side_to_side + + +def get_args_parser(add_help=True): + import argparse + + parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Evaluation", add_help=add_help) + parser.add_argument("--dataset", type=str, default="middlebury2014-train",
help="dataset to use") + parser.add_argument("--dataset-root", type=str, default="", help="root of the dataset") + + parser.add_argument("--checkpoint", type=str, default="", help="path to weights") + parser.add_argument("--weights", type=str, default=None, help="torchvision API weight") + parser.add_argument( + "--model", + type=str, + default="crestereo_base", + help="which model to use if not speciffying a training checkpoint", + ) + parser.add_argument("--img-folder", type=str, default="images") + + parser.add_argument("--batch-size", type=int, default=1, help="batch size") + parser.add_argument("--workers", type=int, default=0, help="number of workers") + + parser.add_argument("--eval-size", type=int, nargs="+", default=[384, 512], help="resize size") + parser.add_argument( + "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization" + ) + parser.add_argument( + "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization" + ) + parser.add_argument( + "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False + ) + parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity") + parser.add_argument( + "--interpolation-strategy", + type=str, + default="bilinear", + help="interpolation strategy", + choices=["bilinear", "bicubic", "mixed"], + ) + + parser.add_argument("--n_iterations", nargs="+", type=int, default=[10], help="number of recurent iterations") + parser.add_argument("--n_cascades", nargs="+", type=int, default=[1], help="number of cascades") + parser.add_argument( + "--metrics", + type=str, + nargs="+", + default=["mae", "rmse", "1px", "3px", "5px", "relepe"], + help="metrics to log", + choices=AVAILABLE_METRICS, + ) + parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training") + + parser.add_argument("--world-size", type=int, default=1, help="number of distributed processes") + parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training") + parser.add_argument("--device", type=str, default="cuda", help="device to use for training") + + parser.add_argument("--save-images", action="store_true", help="save images of the predictions") + parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"]) + + return parser + + +def cascade_inference(model, image_left, image_right, iterations, cascades): + # check that image size is divisible by 16 * (2 ** (cascades - 1)) + for image in [image_left, image_right]: + if image.shape[-2] % ((2 ** (cascades - 1))) != 0: + raise ValueError( + f"image height is not divisible by {16 * (2 ** (cascades - 1))}. Image shape: {image.shape[-2]}" + ) + + if image.shape[-1] % ((2 ** (cascades - 1))) != 0: + raise ValueError( + f"image width is not divisible by {16 * (2 ** (cascades - 1))}. 
Image shape: {image.shape[-2]}" + ) + + left_image_pyramid = [image_left] + right_image_pyramid = [image_right] + for idx in range(0, cascades - 1): + ds_factor = int(2 ** (idx + 1)) + ds_shape = (image_left.shape[-2] // ds_factor, image_left.shape[-1] // ds_factor) + left_image_pyramid += F.interpolate(image_left, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze(0) + right_image_pyramid += F.interpolate(image_right, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze( + 0 + ) + + flow_init = None + for left_image, right_image in zip(reversed(left_image_pyramid), reversed(right_image_pyramid)): + flow_pred = model(left_image, right_image, flow_init, num_iters=iterations) + # flow pred is a list + flow_init = flow_pred[-1] + + return flow_init + + +@torch.inference_mode() +def _evaluate( + model, + args, + val_loader, + *, + padder_mode, + print_freq=10, + writer=None, + step=None, + iterations=10, + cascades=1, + batch_size=None, + header=None, + save_images=False, + save_path="", +): + """Helper function to compute various metrics (epe, etc.) for a model on a given dataset. + We process as many samples as possible with ddp. + """ + model.eval() + header = header or "Test:" + device = torch.device(args.device) + metric_logger = utils.MetricLogger(delimiter=" ") + + iterations = iterations or args.recurrent_updates + + logger = utils.MetricLogger() + for meter_name in args.metrics: + logger.add_meter(meter_name, fmt="{global_avg:.4f}") + if "fl-all" not in args.metrics: + logger.add_meter("fl-all", fmt="{global_avg:.4f}") + + num_processed_samples = 0 + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + batch_idx = 0 + for blob in metric_logger.log_every(val_loader, print_freq, header): + image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) + padder = utils.InputPadder(image_left.shape, mode=padder_mode) + image_left, image_right = padder.pad(image_left, image_right) + + disp_pred = cascade_inference(model, image_left, image_right, iterations, cascades) + disp_pred = disp_pred[:, :1, :, :] + disp_pred = padder.unpad(disp_pred) + + if save_images: + if args.distributed: + rank_prefix = args.rank + else: + rank_prefix = 0 + make_prediction_image_side_to_side( + disp_pred, disp_gt, valid_disp_mask, save_path, prefix=f"batch_{rank_prefix}_{batch_idx}" + ) + + metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) + num_processed_samples += image_left.shape[0] + for name in metrics: + logger.meters[name].update(metrics[name], n=1) + + batch_idx += 1 + + num_processed_samples = utils.reduce_across_processes(num_processed_samples) / args.world_size + + print("Num_processed_samples: ", num_processed_samples) + if ( + hasattr(val_loader.dataset, "__len__") + and len(val_loader.dataset) != num_processed_samples + and torch.distributed.get_rank() == 0 + ): + warnings.warn( + f"Number of processed samples {num_processed_samples} is different" + f"from the dataset size {len(val_loader.dataset)}. This may happen if" + "the dataset is not divisible by the batch size. Try lowering the batch size for more accurate results." 
+ ) + + if writer is not None and args.rank == 0: + for meter_name, meter_value in logger.meters.items(): + scalar_name = f"{meter_name} {header}" + writer.add_scalar(scalar_name, meter_value.avg, step) + + logger.synchronize_between_processes() + print(header, logger) + + logger_metrics = {k: v.global_avg for k, v in logger.meters.items()} + return logger_metrics + + +def evaluate(model, loader, args, writer=None, step=None): + os.makedirs(args.img_folder, exist_ok=True) + checkpoint_name = os.path.basename(args.checkpoint) or args.weights + image_checkpoint_folder = os.path.join(args.img_folder, checkpoint_name) + + metrics = {} + base_image_folder = os.path.join(image_checkpoint_folder, args.dataset) + os.makedirs(base_image_folder, exist_ok=True) + + for n_cascades in args.n_cascades: + for n_iters in args.n_iterations: + + config = f"{n_cascades}c_{n_iters}i" + config_image_folder = os.path.join(base_image_folder, config) + os.makedirs(config_image_folder, exist_ok=True) + + metrics[config] = _evaluate( + model, + args, + loader, + padder_mode=args.padder_type, + header=f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{n_cascades} n_iters:{n_iters}", + batch_size=args.batch_size, + writer=writer, + step=step, + iterations=n_iters, + cascades=n_cascades, + save_path=config_image_folder, + save_images=args.save_images, + ) + + metric_log = [] + metric_log_dict = {} + # print the final results + for config in metrics: + config_tokens = config.split("_") + config_iters = config_tokens[1][:-1] + config_cascades = config_tokens[0][:-1] + + metric_log_dict[config_cascades] = metric_log_dict.get(config_cascades, {}) + metric_log_dict[config_cascades][config_iters] = metrics[config] + + evaluation_str = f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{config_cascades} recurrent_updates:{config_iters}" + metrics_str = f"Metrics: {metrics[config]}" + metric_log.extend([evaluation_str, metrics_str]) + + print(evaluation_str) + print(metrics_str) + + eval_log_name = f"{checkpoint_name.replace('.pth', '')}_eval.log" + print("Saving eval log to: ", eval_log_name) + with open(eval_log_name, "w") as f: + f.write(f"Dataset: {args.dataset} @size: {args.eval_size}:\n") + # write the dict line by line for each key, and each value in the keys + for config_cascades in metric_log_dict: + f.write("{\n") + f.write(f"\t{config_cascades}: {{\n") + for config_iters in metric_log_dict[config_cascades]: + # convert every metric to 4 decimal places + metrics = metric_log_dict[config_cascades][config_iters] + metrics = {k: float(f"{v:.3f}") for k, v in metrics.items()} + f.write(f"\t\t{config_iters}: {metrics}\n") + f.write("\t},\n") + f.write("}\n") + + +def load_checkpoint(args): + utils.setup_ddp(args) + + if not args.weights: + checkpoint = torch.load(args.checkpoint, map_location=torch.device("cpu"), weights_only=True) + if "model" in checkpoint: + experiment_args = checkpoint["args"] + model = torchvision.prototype.models.depth.stereo.__dict__[experiment_args.model](weights=None) + model.load_state_dict(checkpoint["model"]) + else: + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=None) + model.load_state_dict(checkpoint) + + # set the appropriate devices + if args.distributed and args.device == "cpu": + raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") + device = torch.device(args.device) + else: + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) + + # convert to 
DDP if need be + if args.distributed: + model = model.to(args.device) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.to(device) + + return model + + +def main(args): + model = load_checkpoint(args) + loader = make_eval_loader(args.dataset, args) + evaluate(model, loader, args) + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/references/depth/stereo/parsing.py b/references/depth/stereo/parsing.py new file mode 100644 index 00000000000..71a3ba9904e --- /dev/null +++ b/references/depth/stereo/parsing.py @@ -0,0 +1,89 @@ +import argparse +from functools import partial + +import torch + +from presets import StereoMatchingEvalPreset, StereoMatchingTrainPreset +from torchvision.datasets import ( + CarlaStereo, + CREStereo, + ETH3DStereo, + FallingThingsStereo, + InStereo2k, + Kitti2012Stereo, + Kitti2015Stereo, + Middlebury2014Stereo, + SceneFlowStereo, + SintelStereo, +) + +VALID_DATASETS = { + "crestereo": partial(CREStereo), + "carla-highres": partial(CarlaStereo), + "instereo2k": partial(InStereo2k), + "sintel": partial(SintelStereo), + "sceneflow-monkaa": partial(SceneFlowStereo, variant="Monkaa", pass_name="both"), + "sceneflow-flyingthings": partial(SceneFlowStereo, variant="FlyingThings3D", pass_name="both"), + "sceneflow-driving": partial(SceneFlowStereo, variant="Driving", pass_name="both"), + "fallingthings": partial(FallingThingsStereo, variant="both"), + "eth3d-train": partial(ETH3DStereo, split="train"), + "eth3d-test": partial(ETH3DStereo, split="test"), + "kitti2015-train": partial(Kitti2015Stereo, split="train"), + "kitti2015-test": partial(Kitti2015Stereo, split="test"), + "kitti2012-train": partial(Kitti2012Stereo, split="train"), + "kitti2012-test": partial(Kitti2012Stereo, split="train"), + "middlebury2014-other": partial( + Middlebury2014Stereo, split="additional", use_ambient_view=True, calibration="both" + ), + "middlebury2014-train": partial(Middlebury2014Stereo, split="train", calibration="perfect"), + "middlebury2014-test": partial(Middlebury2014Stereo, split="test", calibration=None), + "middlebury2014-train-ambient": partial( + Middlebury2014Stereo, split="train", use_ambient_views=True, calibrartion="perfect" + ), +} + + +def make_train_transform(args: argparse.Namespace) -> torch.nn.Module: + return StereoMatchingTrainPreset( + resize_size=args.resize_size, + crop_size=args.crop_size, + rescale_prob=args.rescale_prob, + scaling_type=args.scaling_type, + scale_range=args.scale_range, + scale_interpolation_type=args.interpolation_strategy, + use_grayscale=args.use_grayscale, + mean=args.norm_mean, + std=args.norm_std, + horizontal_flip_prob=args.flip_prob, + gpu_transforms=args.gpu_transforms, + max_disparity=args.max_disparity, + spatial_shift_prob=args.spatial_shift_prob, + spatial_shift_max_angle=args.spatial_shift_max_angle, + spatial_shift_max_displacement=args.spatial_shift_max_displacement, + spatial_shift_interpolation_type=args.interpolation_strategy, + gamma_range=args.gamma_range, + brightness=args.brightness_range, + contrast=args.contrast_range, + saturation=args.saturation_range, + hue=args.hue_range, + asymmetric_jitter_prob=args.asymmetric_jitter_prob, + ) + + +def make_eval_transform(args: argparse.Namespace) -> torch.nn.Module: + if args.eval_size is None: + resize_size = args.crop_size + else: + resize_size = args.eval_size + + return StereoMatchingEvalPreset( + mean=args.norm_mean, + std=args.norm_std, + use_grayscale=args.use_grayscale, + 
resize_size=resize_size, + interpolation_type=args.interpolation_strategy, + ) + + +def make_dataset(dataset_name: str, dataset_root: str, transforms: torch.nn.Module) -> torch.utils.data.Dataset: + return VALID_DATASETS[dataset_name](root=dataset_root, transforms=transforms) diff --git a/references/depth/stereo/presets.py b/references/depth/stereo/presets.py new file mode 100644 index 00000000000..cadd2405178 --- /dev/null +++ b/references/depth/stereo/presets.py @@ -0,0 +1,144 @@ +from typing import Optional, Tuple, Union + +import torch +import transforms as T + + +class StereoMatchingEvalPreset(torch.nn.Module): + def __init__( + self, + mean: float = 0.5, + std: float = 0.5, + resize_size: Optional[Tuple[int, ...]] = None, + max_disparity: Optional[float] = None, + interpolation_type: str = "bilinear", + use_grayscale: bool = False, + ) -> None: + super().__init__() + + transforms = [ + T.ToTensor(), + T.ConvertImageDtype(torch.float32), + ] + + if use_grayscale: + transforms.append(T.ConvertToGrayscale()) + + if resize_size is not None: + transforms.append(T.Resize(resize_size, interpolation_type=interpolation_type)) + + transforms.extend( + [ + T.Normalize(mean=mean, std=std), + T.MakeValidDisparityMask(max_disparity=max_disparity), + T.ValidateModelInput(), + ] + ) + + self.transforms = T.Compose(transforms) + + def forward(self, images, disparities, masks): + return self.transforms(images, disparities, masks) + + +class StereoMatchingTrainPreset(torch.nn.Module): + def __init__( + self, + *, + resize_size: Optional[Tuple[int, ...]], + resize_interpolation_type: str = "bilinear", + # RandomResizeAndCrop params + crop_size: Tuple[int, int], + rescale_prob: float = 1.0, + scaling_type: str = "exponential", + scale_range: Tuple[float, float] = (-0.2, 0.5), + scale_interpolation_type: str = "bilinear", + # convert to grayscale + use_grayscale: bool = False, + # normalization params + mean: float = 0.5, + std: float = 0.5, + # processing device + gpu_transforms: bool = False, + # masking + max_disparity: Optional[int] = 256, + # SpatialShift params + spatial_shift_prob: float = 0.5, + spatial_shift_max_angle: float = 0.5, + spatial_shift_max_displacement: float = 0.5, + spatial_shift_interpolation_type: str = "bilinear", + # AssymetricColorJitter + gamma_range: Tuple[float, float] = (0.8, 1.2), + brightness: Union[int, Tuple[int, int]] = (0.8, 1.2), + contrast: Union[int, Tuple[int, int]] = (0.8, 1.2), + saturation: Union[int, Tuple[int, int]] = 0.0, + hue: Union[int, Tuple[int, int]] = 0.0, + asymmetric_jitter_prob: float = 1.0, + # RandomHorizontalFlip + horizontal_flip_prob: float = 0.5, + # RandomOcclusion + occlusion_prob: float = 0.0, + occlusion_px_range: Tuple[int, int] = (50, 100), + # RandomErase + erase_prob: float = 0.0, + erase_px_range: Tuple[int, int] = (50, 100), + erase_num_repeats: int = 1, + ) -> None: + + if scaling_type not in ["linear", "exponential"]: + raise ValueError(f"Unknown scaling type: {scaling_type}. 
Available types: linear, exponential") + + super().__init__() + transforms = [T.ToTensor()] + + # when fixing size across multiple datasets, we ensure + # that the same size is used for all datasets when cropping + if resize_size is not None: + transforms.append(T.Resize(resize_size, interpolation_type=resize_interpolation_type)) + + if gpu_transforms: + transforms.append(T.ToGPU()) + + # color handling + color_transforms = [ + T.AsymmetricColorJitter( + brightness=brightness, contrast=contrast, saturation=saturation, hue=hue, p=asymmetric_jitter_prob + ), + T.AsymetricGammaAdjust(p=asymmetric_jitter_prob, gamma_range=gamma_range), + ] + + if use_grayscale: + color_transforms.append(T.ConvertToGrayscale()) + + transforms.extend(color_transforms) + + transforms.extend( + [ + T.RandomSpatialShift( + p=spatial_shift_prob, + max_angle=spatial_shift_max_angle, + max_px_shift=spatial_shift_max_displacement, + interpolation_type=spatial_shift_interpolation_type, + ), + T.ConvertImageDtype(torch.float32), + T.RandomRescaleAndCrop( + crop_size=crop_size, + scale_range=scale_range, + rescale_prob=rescale_prob, + scaling_type=scaling_type, + interpolation_type=scale_interpolation_type, + ), + T.RandomHorizontalFlip(horizontal_flip_prob), + # occlusion after flip, otherwise we're occluding the reference image + T.RandomOcclusion(p=occlusion_prob, occlusion_px_range=occlusion_px_range), + T.RandomErase(p=erase_prob, erase_px_range=erase_px_range, max_erase=erase_num_repeats), + T.Normalize(mean=mean, std=std), + T.MakeValidDisparityMask(max_disparity), + T.ValidateModelInput(), + ] + ) + + self.transforms = T.Compose(transforms) + + def forward(self, images, disparties, mask): + return self.transforms(images, disparties, mask) diff --git a/references/depth/stereo/train.py b/references/depth/stereo/train.py new file mode 100644 index 00000000000..83db313ae80 --- /dev/null +++ b/references/depth/stereo/train.py @@ -0,0 +1,788 @@ +import argparse +import os +import warnings +from pathlib import Path +from typing import List, Union + +import numpy.typing as npt +import torch +import torch.distributed as dist +import torchvision.models.optical_flow +import torchvision.prototype.models.depth.stereo +import utils +import visualization + +from parsing import make_dataset, make_eval_transform, make_train_transform, VALID_DATASETS +from torch import nn +from torchvision.transforms.functional import get_dimensions, InterpolationMode, resize +from utils.metrics import AVAILABLE_METRICS +from utils.norm import freeze_batch_norm + + +def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_channels: int) -> torch.Tensor: + """Helper function to make stereo flow from a given model output""" + if isinstance(flow, list): + return [make_stereo_flow(flow_i, model_out_channels) for flow_i in flow] + + B, C, H, W = flow.shape + # we need to add zero flow if the model outputs 2 channels + if C == 1 and model_out_channels == 2: + zero_flow = torch.zeros_like(flow) + # by convention the flow is X-Y axis, so we need the Y flow last + flow = torch.cat([flow, zero_flow], dim=1) + return flow + + +def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> npt.NDArray: + """Helper function to return a learning rate scheduler for CRE-stereo""" + if args.decay_after_steps < args.warmup_steps: + raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}") + + warmup_steps = args.warmup_steps if args.warmup_steps else 0 + flat_lr_steps = 
args.decay_after_steps - warmup_steps if args.decay_after_steps else 0 + decay_lr_steps = args.total_iterations - flat_lr_steps + + max_lr = args.lr + min_lr = args.min_lr + + schedulers = [] + milestones = [] + + if warmup_steps > 0: + if args.lr_warmup_method == "linear": + warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=args.lr_warmup_factor, total_iters=warmup_steps + ) + elif args.lr_warmup_method == "constant": + warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( + optimizer, factor=args.lr_warmup_factor, total_iters=warmup_steps + ) + else: + raise ValueError(f"Unknown lr warmup method {args.lr_warmup_method}") + schedulers.append(warmup_lr_scheduler) + milestones.append(warmup_steps) + + if flat_lr_steps > 0: + flat_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=max_lr, total_iters=flat_lr_steps) + schedulers.append(flat_lr_scheduler) + milestones.append(flat_lr_steps + warmup_steps) + + if decay_lr_steps > 0: + if args.lr_decay_method == "cosine": + decay_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=decay_lr_steps, eta_min=min_lr + ) + elif args.lr_decay_method == "linear": + decay_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=max_lr, end_factor=min_lr, total_iters=decay_lr_steps + ) + elif args.lr_decay_method == "exponential": + decay_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR( + optimizer, gamma=args.lr_decay_gamma, last_epoch=-1 + ) + else: + raise ValueError(f"Unknown lr decay method {args.lr_decay_method}") + schedulers.append(decay_lr_scheduler) + + scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers, milestones=milestones) + return scheduler + + +def shuffle_dataset(dataset): + """Shuffle the dataset""" + perm = torch.randperm(len(dataset)) + return torch.utils.data.Subset(dataset, perm) + + +def resize_dataset_to_n_steps( + dataset: torch.utils.data.Dataset, dataset_steps: int, samples_per_step: int, args: argparse.Namespace +) -> torch.utils.data.Dataset: + original_size = len(dataset) + if args.steps_is_epochs: + samples_per_step = original_size + target_size = dataset_steps * samples_per_step + + dataset_copies = [] + n_expands, remainder = divmod(target_size, original_size) + for idx in range(n_expands): + dataset_copies.append(dataset) + + if remainder > 0: + dataset_copies.append(torch.utils.data.Subset(dataset, list(range(remainder)))) + + if args.dataset_shuffle: + dataset_copies = [shuffle_dataset(dataset_copy) for dataset_copy in dataset_copies] + + dataset = torch.utils.data.ConcatDataset(dataset_copies) + return dataset + + +def get_train_dataset(dataset_root: str, args: argparse.Namespace) -> torch.utils.data.Dataset: + datasets = [] + for dataset_name in args.train_datasets: + transform = make_train_transform(args) + dataset = make_dataset(dataset_name, dataset_root, transform) + datasets.append(dataset) + + if len(datasets) == 0: + raise ValueError("No datasets specified for training") + + samples_per_step = args.world_size * args.batch_size + + for idx, (dataset, steps_per_dataset) in enumerate(zip(datasets, args.dataset_steps)): + datasets[idx] = resize_dataset_to_n_steps(dataset, steps_per_dataset, samples_per_step, args) + + dataset = torch.utils.data.ConcatDataset(datasets) + if args.dataset_order_shuffle: + dataset = shuffle_dataset(dataset) + + print(f"Training dataset: {len(dataset)} samples") + return dataset + + +@torch.inference_mode() +def _evaluate( + model, + args, + val_loader, + *, + 
padder_mode, + print_freq=10, + writer=None, + step=None, + iterations=None, + batch_size=None, + header=None, +): + """Helper function to compute various metrics (epe, etc.) for a model on a given dataset.""" + model.eval() + header = header or "Test:" + device = torch.device(args.device) + metric_logger = utils.MetricLogger(delimiter=" ") + + iterations = iterations or args.recurrent_updates + + logger = utils.MetricLogger() + for meter_name in args.metrics: + logger.add_meter(meter_name, fmt="{global_avg:.4f}") + if "fl-all" not in args.metrics: + logger.add_meter("fl-all", fmt="{global_avg:.4f}") + + num_processed_samples = 0 + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + for blob in metric_logger.log_every(val_loader, print_freq, header): + image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) + padder = utils.InputPadder(image_left.shape, mode=padder_mode) + image_left, image_right = padder.pad(image_left, image_right) + + disp_predictions = model(image_left, image_right, flow_init=None, num_iters=iterations) + disp_pred = disp_predictions[-1][:, :1, :, :] + disp_pred = padder.unpad(disp_pred) + + metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) + num_processed_samples += image_left.shape[0] + for name in metrics: + logger.meters[name].update(metrics[name], n=1) + + num_processed_samples = utils.reduce_across_processes(num_processed_samples) + + print("Num_processed_samples: ", num_processed_samples) + if ( + hasattr(val_loader.dataset, "__len__") + and len(val_loader.dataset) != num_processed_samples + and torch.distributed.get_rank() == 0 + ): + warnings.warn( + f"Number of processed samples {num_processed_samples} is different" + f"from the dataset size {len(val_loader.dataset)}. This may happen if" + "the dataset is not divisible by the batch size. Try lowering the batch size or GPU number for more accurate results." 
+ ) + + if writer is not None and args.rank == 0: + for meter_name, meter_value in logger.meters.items(): + scalar_name = f"{meter_name} {header}" + writer.add_scalar(scalar_name, meter_value.avg, step) + + logger.synchronize_between_processes() + print(header, logger) + + +def make_eval_loader(dataset_name: str, args: argparse.Namespace) -> torch.utils.data.DataLoader: + if args.weights: + weights = torchvision.models.get_weight(args.weights) + trans = weights.transforms() + + def preprocessing(image_left, image_right, disp, valid_disp_mask): + C_o, H_o, W_o = get_dimensions(image_left) + image_left, image_right = trans(image_left, image_right) + + C_t, H_t, W_t = get_dimensions(image_left) + scale_factor = W_t / W_o + + if disp is not None and not isinstance(disp, torch.Tensor): + disp = torch.from_numpy(disp) + if W_t != W_o: + disp = resize(disp, (H_t, W_t), mode=InterpolationMode.BILINEAR) * scale_factor + if valid_disp_mask is not None and not isinstance(valid_disp_mask, torch.Tensor): + valid_disp_mask = torch.from_numpy(valid_disp_mask) + if W_t != W_o: + valid_disp_mask = resize(valid_disp_mask, (H_t, W_t), mode=InterpolationMode.NEAREST) + return image_left, image_right, disp, valid_disp_mask + + else: + preprocessing = make_eval_transform(args) + + val_dataset = make_dataset(dataset_name, args.dataset_root, transforms=preprocessing) + if args.distributed: + sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=False) + else: + sampler = torch.utils.data.SequentialSampler(val_dataset) + + val_loader = torch.utils.data.DataLoader( + val_dataset, + sampler=sampler, + batch_size=args.batch_size, + pin_memory=True, + num_workers=args.workers, + ) + + return val_loader + + +def evaluate(model, loaders, args, writer=None, step=None): + for loader_name, loader in loaders.items(): + _evaluate( + model, + args, + loader, + iterations=args.recurrent_updates, + padder_mode=args.padder_type, + header=f"{loader_name} evaluation", + batch_size=args.batch_size, + writer=writer, + step=step, + ) + + +def run(model, optimizer, scheduler, train_loader, val_loaders, logger, writer, scaler, args): + device = torch.device(args.device) + # wrap the loader in a logger + loader = iter(logger.log_every(train_loader)) + # output channels + model_out_channels = model.module.output_channels if args.distributed else model.output_channels + + torch.set_num_threads(args.threads) + + sequence_criterion = utils.SequenceLoss( + gamma=args.gamma, + max_flow=args.max_disparity, + exclude_large_flows=args.flow_loss_exclude_large, + ).to(device) + + if args.consistency_weight: + consistency_criterion = utils.FlowSequenceConsistencyLoss( + args.gamma, + resize_factor=0.25, + rescale_factor=0.25, + rescale_mode="bilinear", + ).to(device) + else: + consistency_criterion = None + + if args.psnr_weight: + psnr_criterion = utils.PSNRLoss().to(device) + else: + psnr_criterion = None + + if args.smoothness_weight: + smoothness_criterion = utils.SmoothnessLoss().to(device) + else: + smoothness_criterion = None + + if args.photometric_weight: + photometric_criterion = utils.FlowPhotoMetricLoss( + ssim_weight=args.photometric_ssim_weight, + max_displacement_ratio=args.photometric_max_displacement_ratio, + ssim_use_padding=False, + ).to(device) + else: + photometric_criterion = None + + for step in range(args.start_step + 1, args.total_iterations + 1): + data_blob = next(loader) + optimizer.zero_grad() + + # unpack the data blob + image_left, image_right, disp_mask, valid_disp_mask = 
(x.to(device) for x in data_blob) + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + disp_predictions = model(image_left, image_right, flow_init=None, num_iters=args.recurrent_updates) + # different models have different outputs, make sure we get the right ones for this task + disp_predictions = make_stereo_flow(disp_predictions, model_out_channels) + # should the architecture or training loop require it, we have to adjust the disparity mask + # target to possibly look like an optical flow mask + disp_mask = make_stereo_flow(disp_mask, model_out_channels) + # sequence loss on top of the model outputs + + loss = sequence_criterion(disp_predictions, disp_mask, valid_disp_mask) * args.flow_loss_weight + + if args.consistency_weight > 0: + loss_consistency = consistency_criterion(disp_predictions) + loss += loss_consistency * args.consistency_weight + + if args.psnr_weight > 0: + loss_psnr = 0.0 + for pred in disp_predictions: + # predictions might have 2 channels + loss_psnr += psnr_criterion( + pred * valid_disp_mask.unsqueeze(1), + disp_mask * valid_disp_mask.unsqueeze(1), + ).mean() # mean the psnr loss over the batch + loss += loss_psnr / len(disp_predictions) * args.psnr_weight + + if args.photometric_weight > 0: + loss_photometric = 0.0 + for pred in disp_predictions: + # predictions might have 1 channel, therefore we need to inpute 0s for the second channel + if model_out_channels == 1: + pred = torch.cat([pred, torch.zeros_like(pred)], dim=1) + + loss_photometric += photometric_criterion( + image_left, image_right, pred, valid_disp_mask + ) # photometric loss already comes out meaned over the batch + loss += loss_photometric / len(disp_predictions) * args.photometric_weight + + if args.smoothness_weight > 0: + loss_smoothness = 0.0 + for pred in disp_predictions: + # predictions might have 2 channels + loss_smoothness += smoothness_criterion( + image_left, pred[:, :1, :, :] + ).mean() # mean the smoothness loss over the batch + loss += loss_smoothness / len(disp_predictions) * args.smoothness_weight + + with torch.no_grad(): + metrics, _ = utils.compute_metrics( + disp_predictions[-1][:, :1, :, :], # predictions might have 2 channels + disp_mask[:, :1, :, :], # so does the ground truth + valid_disp_mask, + args.metrics, + ) + + metrics.pop("fl-all", None) + logger.update(loss=loss, **metrics) + + if scaler is not None: + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + if args.clip_grad_norm: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + if args.clip_grad_norm: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) + optimizer.step() + + scheduler.step() + + if not dist.is_initialized() or dist.get_rank() == 0: + if writer is not None and step % args.tensorboard_log_frequency == 0: + # log the loss and metrics to tensorboard + + writer.add_scalar("loss", loss, step) + for name, value in logger.meters.items(): + writer.add_scalar(name, value.avg, step) + # log the images to tensorboard + pred_grid = visualization.make_training_sample_grid( + image_left, image_right, disp_mask, valid_disp_mask, disp_predictions + ) + writer.add_image("predictions", pred_grid, step, dataformats="HWC") + + # second thing we want to see is how relevant the iterative refinement is + pred_sequence_grid = visualization.make_disparity_sequence_grid(disp_predictions, disp_mask) + writer.add_image("sequence", pred_sequence_grid, 
step, dataformats="HWC") + + if step % args.save_frequency == 0: + if not args.distributed or args.rank == 0: + model_without_ddp = ( + model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model + ) + checkpoint = { + "model": model_without_ddp.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "step": step, + "args": args, + } + os.makedirs(args.checkpoint_dir, exist_ok=True) + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") + + if step % args.valid_frequency == 0: + evaluate(model, val_loaders, args, writer, step) + model.train() + if args.freeze_batch_norm: + if isinstance(model, nn.parallel.DistributedDataParallel): + freeze_batch_norm(model.module) + else: + freeze_batch_norm(model) + + # one final save at the end + if not args.distributed or args.rank == 0: + model_without_ddp = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model + checkpoint = { + "model": model_without_ddp.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "step": step, + "args": args, + } + os.makedirs(args.checkpoint_dir, exist_ok=True) + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") + + +def main(args): + args.total_iterations = sum(args.dataset_steps) + + # initialize DDP setting + utils.setup_ddp(args) + print(args) + + args.test_only = args.train_datasets is None + + # set the appropriate devices + if args.distributed and args.device == "cpu": + raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") + device = torch.device(args.device) + + # select model architecture + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) + + # convert to DDP if need be + if args.distributed: + model = model.to(args.gpu) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + else: + model.to(device) + model_without_ddp = model + + os.makedirs(args.checkpoint_dir, exist_ok=True) + + val_loaders = {name: make_eval_loader(name, args) for name in args.test_datasets} + + # EVAL ONLY configurations + if args.test_only: + evaluate(model, val_loaders, args) + return + + # Sanity check for the parameter count + print(f"Parameter Count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}") + + # Compose the training dataset + train_dataset = get_train_dataset(args.dataset_root, args) + + # initialize the optimizer + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer == "sgd": + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, momentum=0.9) + else: + raise ValueError(f"Unknown optimizer {args.optimizer}. 
Please choose between adam and sgd") + + # initialize the learning rate schedule + scheduler = make_lr_schedule(args, optimizer) + + # load them from checkpoint if needed + args.start_step = 0 + if args.resume_path is not None: + checkpoint = torch.load(args.resume_path, map_location="cpu", weights_only=True) + if "model" in checkpoint: + # this means the user requested to resume from a training checkpoint + model_without_ddp.load_state_dict(checkpoint["model"]) + # this means the user wants to continue training from where it was left off + if args.resume_schedule: + optimizer.load_state_dict(checkpoint["optimizer"]) + scheduler.load_state_dict(checkpoint["scheduler"]) + args.start_step = checkpoint["step"] + 1 + # modify starting point of the dat + sample_start_step = args.start_step * args.batch_size * args.world_size + train_dataset = train_dataset[sample_start_step:] + + else: + # this means the user wants to finetune on top of a model state dict + # and that no other changes are required + model_without_ddp.load_state_dict(checkpoint) + + torch.backends.cudnn.benchmark = True + + # enable training mode + model.train() + if args.freeze_batch_norm: + freeze_batch_norm(model_without_ddp) + + # put dataloader on top of the dataset + # make sure to disable shuffling since the dataset is already shuffled + # in order to guarantee quasi randomness whilst retaining a deterministic + # dataset consumption order + if args.distributed: + # the train dataset is preshuffled in order to respect the iteration order + sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=False, drop_last=True) + else: + # the train dataset is already shuffled, so we can use a simple SequentialSampler + sampler = torch.utils.data.SequentialSampler(train_dataset) + + train_loader = torch.utils.data.DataLoader( + train_dataset, + sampler=sampler, + batch_size=args.batch_size, + pin_memory=True, + num_workers=args.workers, + ) + + # initialize the logger + if args.tensorboard_summaries: + from torch.utils.tensorboard import SummaryWriter + + tensorboard_path = Path(args.checkpoint_dir) / "tensorboard" + os.makedirs(tensorboard_path, exist_ok=True) + + tensorboard_run = tensorboard_path / f"{args.name}" + writer = SummaryWriter(tensorboard_run) + else: + writer = None + + logger = utils.MetricLogger(delimiter=" ") + + scaler = torch.cuda.amp.GradScaler() if args.mixed_precision else None + # run the training loop + # this will perform optimization, respectively logging and saving checkpoints + # when need be + run( + model=model, + optimizer=optimizer, + scheduler=scheduler, + train_loader=train_loader, + val_loaders=val_loaders, + logger=logger, + writer=writer, + scaler=scaler, + args=args, + ) + + +def get_args_parser(add_help=True): + import argparse + + parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Training", add_help=add_help) + # checkpointing + parser.add_argument("--name", default="crestereo", help="name of the experiment") + parser.add_argument("--resume", type=str, default=None, help="from which checkpoint to resume") + parser.add_argument("--checkpoint-dir", type=str, default="checkpoints", help="path to the checkpoint directory") + + # dataset + parser.add_argument("--dataset-root", type=str, default="", help="path to the dataset root directory") + parser.add_argument( + "--train-datasets", + type=str, + nargs="+", + default=["crestereo"], + help="dataset(s) to train on", + choices=list(VALID_DATASETS.keys()), + ) + parser.add_argument( + "--dataset-steps", 
type=int, nargs="+", default=[300_000], help="number of steps for each dataset" + ) + parser.add_argument( + "--steps-is-epochs", action="store_true", help="if set, dataset-steps are interpreted as epochs" + ) + parser.add_argument( + "--test-datasets", + type=str, + nargs="+", + default=["middlebury2014-train"], + help="dataset(s) to test on", + choices=["middlebury2014-train"], + ) + parser.add_argument("--dataset-shuffle", type=bool, help="shuffle the dataset", default=True) + parser.add_argument("--dataset-order-shuffle", type=bool, help="shuffle the dataset order", default=True) + parser.add_argument("--batch-size", type=int, default=2, help="batch size per GPU") + parser.add_argument("--workers", type=int, default=4, help="number of workers per GPU") + parser.add_argument( + "--threads", + type=int, + default=16, + help="number of CPU threads per GPU. This can be changed around to speed-up transforms if needed. This can lead to worker thread contention so use with care.", + ) + + # model architecture + parser.add_argument( + "--model", + type=str, + default="crestereo_base", + help="model architecture", + choices=["crestereo_base", "raft_stereo"], + ) + parser.add_argument("--recurrent-updates", type=int, default=10, help="number of recurrent updates") + parser.add_argument("--freeze-batch-norm", action="store_true", help="freeze batch norm parameters") + + # loss parameters + parser.add_argument("--gamma", type=float, default=0.8, help="gamma parameter for the flow sequence loss") + parser.add_argument("--flow-loss-weight", type=float, default=1.0, help="weight for the flow loss") + parser.add_argument( + "--flow-loss-exclude-large", + action="store_true", + help="exclude large flow values from the loss. A large value is defined as a value greater than the ground truth flow norm", + default=False, + ) + parser.add_argument("--consistency-weight", type=float, default=0.0, help="consistency loss weight") + parser.add_argument( + "--consistency-resize-factor", + type=float, + default=0.25, + help="consistency loss resize factor to account for the fact that the flow is computed on a downsampled image", + ) + parser.add_argument("--psnr-weight", type=float, default=0.0, help="psnr loss weight") + parser.add_argument("--smoothness-weight", type=float, default=0.0, help="smoothness loss weight") + parser.add_argument("--photometric-weight", type=float, default=0.0, help="photometric loss weight") + parser.add_argument( + "--photometric-max-displacement-ratio", + type=float, + default=0.15, + help="Only pixels with a displacement smaller than this ratio of the image width will be considered for the photometric loss", + ) + parser.add_argument("--photometric-ssim-weight", type=float, default=0.85, help="photometric ssim loss weight") + + # transforms parameters + parser.add_argument("--gpu-transforms", action="store_true", help="use GPU transforms") + parser.add_argument( + "--eval-size", type=int, nargs="+", default=[384, 512], help="size of the images for evaluation" + ) + parser.add_argument("--resize-size", type=int, nargs=2, default=None, help="resize size") + parser.add_argument("--crop-size", type=int, nargs=2, default=[384, 512], help="crop size") + parser.add_argument("--scale-range", type=float, nargs=2, default=[0.6, 1.0], help="random scale range") + parser.add_argument("--rescale-prob", type=float, default=1.0, help="probability of resizing the image") + parser.add_argument( + "--scaling-type", type=str, default="linear", help="scaling type", choices=["exponential", "linear"] + 
) + parser.add_argument("--flip-prob", type=float, default=0.5, help="probability of flipping the image") + parser.add_argument( + "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization" + ) + parser.add_argument( + "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization" + ) + parser.add_argument( + "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False + ) + parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity") + parser.add_argument( + "--interpolation-strategy", + type=str, + default="bilinear", + help="interpolation strategy", + choices=["bilinear", "bicubic", "mixed"], + ) + parser.add_argument("--spatial-shift-prob", type=float, default=1.0, help="probability of shifting the image") + parser.add_argument( + "--spatial-shift-max-angle", type=float, default=0.1, help="maximum angle for the spatial shift" + ) + parser.add_argument( + "--spatial-shift-max-displacement", type=float, default=2.0, help="maximum displacement for the spatial shift" + ) + parser.add_argument("--gamma-range", type=float, nargs="+", default=[0.8, 1.2], help="range for gamma correction") + parser.add_argument( + "--brightness-range", type=float, nargs="+", default=[0.8, 1.2], help="range for brightness correction" + ) + parser.add_argument( + "--contrast-range", type=float, nargs="+", default=[0.8, 1.2], help="range for contrast correction" + ) + parser.add_argument( + "--saturation-range", type=float, nargs="+", default=0.0, help="range for saturation correction" + ) + parser.add_argument("--hue-range", type=float, nargs="+", default=0.0, help="range for hue correction") + parser.add_argument( + "--asymmetric-jitter-prob", + type=float, + default=1.0, + help="probability of using asymmetric jitter instead of symmetric jitter", + ) + parser.add_argument("--occlusion-prob", type=float, default=0.5, help="probability of occluding the rightimage") + parser.add_argument( + "--occlusion-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of occluded pixels" + ) + parser.add_argument("--erase-prob", type=float, default=0.0, help="probability of erasing in both images") + parser.add_argument( + "--erase-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of erased pixels" + ) + parser.add_argument( + "--erase-num-repeats", type=int, default=1, help="number of times to repeat the erase operation" + ) + + # optimizer parameters + parser.add_argument("--optimizer", type=str, default="adam", help="optimizer", choices=["adam", "sgd"]) + parser.add_argument("--lr", type=float, default=4e-4, help="learning rate") + parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay") + parser.add_argument("--clip-grad-norm", type=float, default=0.0, help="clip grad norm") + + # lr_scheduler parameters + parser.add_argument("--min-lr", type=float, default=2e-5, help="minimum learning rate") + parser.add_argument("--warmup-steps", type=int, default=6_000, help="number of warmup steps") + parser.add_argument( + "--decay-after-steps", type=int, default=180_000, help="number of steps after which to start decay the lr" + ) + parser.add_argument( + "--lr-warmup-method", type=str, default="linear", help="warmup method", choices=["linear", "cosine"] + ) + parser.add_argument("--lr-warmup-factor", type=float, default=0.02, help="warmup factor for the learning rate") + parser.add_argument( + "--lr-decay-method", 
+ type=str, + default="linear", + help="decay method", + choices=["linear", "cosine", "exponential"], + ) + parser.add_argument("--lr-decay-gamma", type=float, default=0.8, help="decay factor for the learning rate") + + # deterministic behaviour + parser.add_argument("--seed", type=int, default=42, help="seed for random number generators") + + # mixed precision training + parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training") + + # logging + parser.add_argument("--tensorboard-summaries", action="store_true", help="log to tensorboard") + parser.add_argument("--tensorboard-log-frequency", type=int, default=100, help="log frequency") + parser.add_argument("--save-frequency", type=int, default=1_000, help="save frequency") + parser.add_argument("--valid-frequency", type=int, default=1_000, help="validation frequency") + parser.add_argument( + "--metrics", + type=str, + nargs="+", + default=["mae", "rmse", "1px", "3px", "5px", "relepe"], + help="metrics to log", + choices=AVAILABLE_METRICS, + ) + + # distributed parameters + parser.add_argument("--world-size", type=int, default=8, help="number of distributed processes") + parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training") + parser.add_argument("--device", type=str, default="cuda", help="device to use for training") + + # weights API + parser.add_argument("--weights", type=str, default=None, help="weights API url") + parser.add_argument( + "--resume-path", type=str, default=None, help="a path from which to resume or start fine-tuning" + ) + parser.add_argument("--resume-schedule", action="store_true", help="resume optimizer state") + + # padder parameters + parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"]) + return parser + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/references/depth/stereo/transforms.py b/references/depth/stereo/transforms.py new file mode 100644 index 00000000000..9c4a6bab6d3 --- /dev/null +++ b/references/depth/stereo/transforms.py @@ -0,0 +1,650 @@ +import random +from typing import Callable, List, Optional, Sequence, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F +from torch import Tensor + +T_FLOW = Union[Tensor, np.ndarray, None] +T_MASK = Union[Tensor, np.ndarray, None] +T_STEREO_TENSOR = Tuple[Tensor, Tensor] +T_COLOR_AUG_PARAM = Union[float, Tuple[float, float]] + + +def rand_float_range(size: Sequence[int], low: float, high: float) -> Tensor: + return (low - high) * torch.rand(size) + high + + +class InterpolationStrategy: + + _valid_modes: List[str] = ["mixed", "bicubic", "bilinear"] + + def __init__(self, mode: str = "mixed") -> None: + if mode not in self._valid_modes: + raise ValueError(f"Invalid interpolation mode: {mode}. 
Valid modes are: {self._valid_modes}") + + if mode == "mixed": + self.strategies = [F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC] + elif mode == "bicubic": + self.strategies = [F.InterpolationMode.BICUBIC] + elif mode == "bilinear": + self.strategies = [F.InterpolationMode.BILINEAR] + + def __call__(self) -> F.InterpolationMode: + return random.choice(self.strategies) + + @classmethod + def is_valid(mode: str) -> bool: + return mode in InterpolationStrategy._valid_modes + + @property + def valid_modes() -> List[str]: + return InterpolationStrategy._valid_modes + + +class ValidateModelInput(torch.nn.Module): + # Pass-through transform that checks the shape and dtypes to make sure the model gets what it expects + def forward(self, images: T_STEREO_TENSOR, disparities: T_FLOW, masks: T_MASK): + if images[0].shape != images[1].shape: + raise ValueError("img1 and img2 should have the same shape.") + h, w = images[0].shape[-2:] + if disparities[0] is not None and disparities[0].shape != (1, h, w): + raise ValueError(f"disparities[0].shape should be (1, {h}, {w}) instead of {disparities[0].shape}") + if masks[0] is not None: + if masks[0].shape != (h, w): + raise ValueError(f"masks[0].shape should be ({h}, {w}) instead of {masks[0].shape}") + if masks[0].dtype != torch.bool: + raise TypeError(f"masks[0] should be of dtype torch.bool instead of {masks[0].dtype}") + + return images, disparities, masks + + +class ConvertToGrayscale(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, + images: Tuple[PIL.Image.Image, PIL.Image.Image], + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + img_left = F.rgb_to_grayscale(images[0], num_output_channels=3) + img_right = F.rgb_to_grayscale(images[1], num_output_channels=3) + + return (img_left, img_right), disparities, masks + + +class MakeValidDisparityMask(torch.nn.Module): + def __init__(self, max_disparity: Optional[int] = 256) -> None: + super().__init__() + self.max_disparity = max_disparity + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + valid_masks = tuple( + torch.ones(images[idx].shape[-2:], dtype=torch.bool, device=images[idx].device) if mask is None else mask + for idx, mask in enumerate(masks) + ) + + valid_masks = tuple( + torch.logical_and(mask, disparity > 0).squeeze(0) if disparity is not None else mask + for mask, disparity in zip(valid_masks, disparities) + ) + + if self.max_disparity is not None: + valid_masks = tuple( + torch.logical_and(mask, disparity < self.max_disparity).squeeze(0) if disparity is not None else mask + for mask, disparity in zip(valid_masks, disparities) + ) + + return images, disparities, valid_masks + + +class ToGPU(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + dev_images = tuple(image.cuda() for image in images) + dev_disparities = tuple(map(lambda x: x.cuda() if x is not None else None, disparities)) + dev_masks = tuple(map(lambda x: x.cuda() if x is not None else None, masks)) + return dev_images, dev_disparities, dev_masks + + +class ConvertImageDtype(torch.nn.Module): + def __init__(self, 
dtype: torch.dtype): + super().__init__() + self.dtype = dtype + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + img_left = F.convert_image_dtype(images[0], dtype=self.dtype) + img_right = F.convert_image_dtype(images[1], dtype=self.dtype) + + img_left = img_left.contiguous() + img_right = img_right.contiguous() + + return (img_left, img_right), disparities, masks + + +class Normalize(torch.nn.Module): + def __init__(self, mean: List[float], std: List[float]) -> None: + super().__init__() + self.mean = mean + self.std = std + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left = F.normalize(images[0], mean=self.mean, std=self.std) + img_right = F.normalize(images[1], mean=self.mean, std=self.std) + + img_left = img_left.contiguous() + img_right = img_right.contiguous() + + return (img_left, img_right), disparities, masks + + +class ToTensor(torch.nn.Module): + def forward( + self, + images: Tuple[PIL.Image.Image, PIL.Image.Image], + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + if images[0] is None: + raise ValueError("img_left is None") + if images[1] is None: + raise ValueError("img_right is None") + + img_left = F.pil_to_tensor(images[0]) + img_right = F.pil_to_tensor(images[1]) + disparity_tensors = () + mask_tensors = () + + for idx in range(2): + disparity_tensors += (torch.from_numpy(disparities[idx]),) if disparities[idx] is not None else (None,) + mask_tensors += (torch.from_numpy(masks[idx]),) if masks[idx] is not None else (None,) + + return (img_left, img_right), disparity_tensors, mask_tensors + + +class AsymmetricColorJitter(T.ColorJitter): + # p determines the probability of doing asymmetric vs symmetric color jittering + def __init__( + self, + brightness: T_COLOR_AUG_PARAM = 0, + contrast: T_COLOR_AUG_PARAM = 0, + saturation: T_COLOR_AUG_PARAM = 0, + hue: T_COLOR_AUG_PARAM = 0, + p: float = 0.2, + ): + super().__init__(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + if torch.rand(1) < self.p: + # asymmetric: different transform for img1 and img2 + img_left = super().forward(images[0]) + img_right = super().forward(images[1]) + else: + # symmetric: same transform for img1 and img2 + batch = torch.stack(images) + batch = super().forward(batch) + img_left, img_right = batch[0], batch[1] + + return (img_left, img_right), disparities, masks + + +class AsymetricGammaAdjust(torch.nn.Module): + def __init__(self, p: float, gamma_range: Tuple[float, float], gain: float = 1) -> None: + super().__init__() + self.gamma_range = gamma_range + self.gain = gain + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + gamma = rand_float_range((1,), low=self.gamma_range[0], high=self.gamma_range[1]).item() + + if torch.rand(1) < self.p: + # asymmetric: different transform for img1 and img2 + 
img_left = F.adjust_gamma(images[0], gamma, gain=self.gain) + img_right = F.adjust_gamma(images[1], gamma, gain=self.gain) + else: + # symmetric: same transform for img1 and img2 + batch = torch.stack(images) + batch = F.adjust_gamma(batch, gamma, gain=self.gain) + img_left, img_right = batch[0], batch[1] + + return (img_left, img_right), disparities, masks + + +class RandomErase(torch.nn.Module): + # Produces multiple symmetric random erasures + # these can be viewed as occlusions present in both camera views. + # Similarly to Optical Flow occlusion prediction tasks, we mask these pixels in the disparity map + def __init__( + self, + p: float = 0.5, + erase_px_range: Tuple[int, int] = (50, 100), + value: Union[Tensor, float] = 0, + inplace: bool = False, + max_erase: int = 2, + ): + super().__init__() + self.min_px_erase = erase_px_range[0] + self.max_px_erase = erase_px_range[1] + if self.max_px_erase < 0: + raise ValueError("erase_px_range[1] should be equal or greater than 0") + if self.min_px_erase < 0: + raise ValueError("erase_px_range[0] should be equal or greater than 0") + if self.min_px_erase > self.max_px_erase: + raise ValueError("erase_prx_range[0] should be equal or lower than erase_px_range[1]") + + self.p = p + self.value = value + self.inplace = inplace + self.max_erase = max_erase + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + if torch.rand(1) < self.p: + return images, disparities, masks + + image_left, image_right = images + mask_left, mask_right = masks + for _ in range(torch.randint(self.max_erase, size=(1,)).item()): + y, x, h, w, v = self._get_params(image_left) + image_right = F.erase(image_right, y, x, h, w, v, self.inplace) + image_left = F.erase(image_left, y, x, h, w, v, self.inplace) + # similarly to optical flow occlusion prediction, we consider + # any erasure pixels that are in both images to be occluded therefore + # we mark them as invalid + if mask_left is not None: + mask_left = F.erase(mask_left, y, x, h, w, False, self.inplace) + if mask_right is not None: + mask_right = F.erase(mask_right, y, x, h, w, False, self.inplace) + + return (image_left, image_right), disparities, (mask_left, mask_right) + + def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: + img_h, img_w = img.shape[-2:] + crop_h, crop_w = ( + random.randint(self.min_px_erase, self.max_px_erase), + random.randint(self.min_px_erase, self.max_px_erase), + ) + crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) + + return crop_y, crop_x, crop_h, crop_w, self.value + + +class RandomOcclusion(torch.nn.Module): + # This adds an occlusion in the right image + # the occluded patch works as a patch erase where the erase value is the mean + # of the pixels from the selected zone + def __init__(self, p: float = 0.5, occlusion_px_range: Tuple[int, int] = (50, 100), inplace: bool = False): + super().__init__() + + self.min_px_occlusion = occlusion_px_range[0] + self.max_px_occlusion = occlusion_px_range[1] + + if self.max_px_occlusion < 0: + raise ValueError("occlusion_px_range[1] should be greater or equal than 0") + if self.min_px_occlusion < 0: + raise ValueError("occlusion_px_range[0] should be greater or equal than 0") + if self.min_px_occlusion > self.max_px_occlusion: + raise ValueError("occlusion_px_range[0] should be lower than occlusion_px_range[1]") + + self.p = p + self.inplace = inplace 
+ + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + left_image, right_image = images + + if torch.rand(1) < self.p: + return images, disparities, masks + + y, x, h, w, v = self._get_params(right_image) + right_image = F.erase(right_image, y, x, h, w, v, self.inplace) + + return ((left_image, right_image), disparities, masks) + + def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: + img_h, img_w = img.shape[-2:] + crop_h, crop_w = ( + random.randint(self.min_px_occlusion, self.max_px_occlusion), + random.randint(self.min_px_occlusion, self.max_px_occlusion), + ) + + crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) + occlusion_value = img[..., crop_y : crop_y + crop_h, crop_x : crop_x + crop_w].mean(dim=(-2, -1), keepdim=True) + + return (crop_y, crop_x, crop_h, crop_w, occlusion_value) + + +class RandomSpatialShift(torch.nn.Module): + # This transform applies a vertical shift and a slight angle rotation and the same time + def __init__( + self, p: float = 0.5, max_angle: float = 0.1, max_px_shift: int = 2, interpolation_type: str = "bilinear" + ) -> None: + super().__init__() + self.p = p + self.max_angle = max_angle + self.max_px_shift = max_px_shift + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + # the transform is applied only on the right image + # in order to mimic slight calibration issues + img_left, img_right = images + + INTERP_MODE = self._interpolation_mode_strategy() + + if torch.rand(1) < self.p: + # [0, 1] -> [-a, a] + shift = rand_float_range((1,), low=-self.max_px_shift, high=self.max_px_shift).item() + angle = rand_float_range((1,), low=-self.max_angle, high=self.max_angle).item() + # sample center point for the rotation matrix + y = torch.randint(size=(1,), low=0, high=img_right.shape[-2]).item() + x = torch.randint(size=(1,), low=0, high=img_right.shape[-1]).item() + # apply affine transformations + img_right = F.affine( + img_right, + angle=angle, + translate=[0, shift], # translation only on the y-axis + center=[x, y], + scale=1.0, + shear=0.0, + interpolation=INTERP_MODE, + ) + + return ((img_left, img_right), disparities, masks) + + +class RandomHorizontalFlip(torch.nn.Module): + def __init__(self, p: float = 0.5) -> None: + super().__init__() + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left, img_right = images + dsp_left, dsp_right = disparities + mask_left, mask_right = masks + + if dsp_right is not None and torch.rand(1) < self.p: + img_left, img_right = F.hflip(img_left), F.hflip(img_right) + dsp_left, dsp_right = F.hflip(dsp_left), F.hflip(dsp_right) + if mask_left is not None and mask_right is not None: + mask_left, mask_right = F.hflip(mask_left), F.hflip(mask_right) + return ((img_right, img_left), (dsp_right, dsp_left), (mask_right, mask_left)) + + return images, disparities, masks + + +class Resize(torch.nn.Module): + def __init__(self, resize_size: Tuple[int, ...], interpolation_type: str = "bilinear") -> None: + super().__init__() + self.resize_size = 
list(resize_size) # doing this to keep mypy happy + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + resized_images = () + resized_disparities = () + resized_masks = () + + INTERP_MODE = self._interpolation_mode_strategy() + + for img in images: + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the stereo models with antialias=True? + resized_images += (F.resize(img, self.resize_size, interpolation=INTERP_MODE, antialias=False),) + + for dsp in disparities: + if dsp is not None: + # rescale disparity to match the new image size + scale_x = self.resize_size[1] / dsp.shape[-1] + resized_disparities += (F.resize(dsp, self.resize_size, interpolation=INTERP_MODE) * scale_x,) + else: + resized_disparities += (None,) + + for mask in masks: + if mask is not None: + resized_masks += ( + # we squeeze and unsqueeze because the API requires > 3D tensors + F.resize( + mask.unsqueeze(0), + self.resize_size, + interpolation=F.InterpolationMode.NEAREST, + ).squeeze(0), + ) + else: + resized_masks += (None,) + + return resized_images, resized_disparities, resized_masks + + +class RandomRescaleAndCrop(torch.nn.Module): + # This transform will resize the input with a given proba, and then crop it. + # These are the reversed operations of the built-in RandomResizedCrop, + # although the order of the operations doesn't matter too much: resizing a + # crop would give the same result as cropping a resized image, up to + # interpolation artifact at the borders of the output. + # + # The reason we don't rely on RandomResizedCrop is because of a significant + # difference in the parametrization of both transforms, in particular, + # because of the way the random parameters are sampled in both transforms, + # which leads to fairly different results (and different epe). For more details see + # https://github.com/pytorch/vision/pull/5026/files#r762932579 + def __init__( + self, + crop_size: Tuple[int, int], + scale_range: Tuple[float, float] = (-0.2, 0.5), + rescale_prob: float = 0.8, + scaling_type: str = "exponential", + interpolation_type: str = "bilinear", + ) -> None: + super().__init__() + self.crop_size = crop_size + self.min_scale = scale_range[0] + self.max_scale = scale_range[1] + self.rescale_prob = rescale_prob + self.scaling_type = scaling_type + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + if self.scaling_type == "linear" and self.min_scale < 0: + raise ValueError("min_scale must be >= 0 for linear scaling") + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left, img_right = images + dsp_left, dsp_right = disparities + mask_left, mask_right = masks + INTERP_MODE = self._interpolation_mode_strategy() + + # randomly sample scale + h, w = img_left.shape[-2:] + # Note: in original code, they use + 1 instead of + 8 for sparse datasets (e.g. 
Kitti) + # It shouldn't matter much + min_scale = max((self.crop_size[0] + 8) / h, (self.crop_size[1] + 8) / w) + + # exponential scaling will draw a random scale in (min_scale, max_scale) and then raise + # 2 to the power of that random value. This final scale distribution will have a different + # mean and variance than a uniform distribution. Note that a scale of 1 will result in + # a rescaling of 2X the original size, whereas a scale of -1 will result in a rescaling + # of 0.5X the original size. + if self.scaling_type == "exponential": + scale = 2 ** torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() + # linear scaling will draw a random scale in (min_scale, max_scale) + elif self.scaling_type == "linear": + scale = torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() + + scale = max(scale, min_scale) + + new_h, new_w = round(h * scale), round(w * scale) + + if torch.rand(1).item() < self.rescale_prob: + # rescale the images + img_left = F.resize(img_left, size=(new_h, new_w), interpolation=INTERP_MODE) + img_right = F.resize(img_right, size=(new_h, new_w), interpolation=INTERP_MODE) + + resized_masks, resized_disparities = (), () + + for disparity, mask in zip(disparities, masks): + if disparity is not None: + if mask is None: + resized_disparity = F.resize(disparity, size=(new_h, new_w), interpolation=INTERP_MODE) + # rescale the disparity + resized_disparity = ( + resized_disparity * torch.tensor([scale], device=resized_disparity.device)[:, None, None] + ) + resized_mask = None + else: + resized_disparity, resized_mask = _resize_sparse_flow( + disparity, mask, scale_x=scale, scale_y=scale + ) + resized_masks += (resized_mask,) + resized_disparities += (resized_disparity,) + + else: + resized_disparities = disparities + resized_masks = masks + + disparities = resized_disparities + masks = resized_masks + + # Note: For sparse datasets (Kitti), the original code uses a "margin" + # See e.g. 
https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 + # We don't, not sure if it matters much + y0 = torch.randint(0, img_left.shape[1] - self.crop_size[0], size=(1,)).item() + x0 = torch.randint(0, img_right.shape[2] - self.crop_size[1], size=(1,)).item() + + img_left = F.crop(img_left, y0, x0, self.crop_size[0], self.crop_size[1]) + img_right = F.crop(img_right, y0, x0, self.crop_size[0], self.crop_size[1]) + if dsp_left is not None: + dsp_left = F.crop(disparities[0], y0, x0, self.crop_size[0], self.crop_size[1]) + if dsp_right is not None: + dsp_right = F.crop(disparities[1], y0, x0, self.crop_size[0], self.crop_size[1]) + + cropped_masks = () + for mask in masks: + if mask is not None: + mask = F.crop(mask, y0, x0, self.crop_size[0], self.crop_size[1]) + cropped_masks += (mask,) + + return ((img_left, img_right), (dsp_left, dsp_right), cropped_masks) + + +def _resize_sparse_flow( + flow: Tensor, valid_flow_mask: Tensor, scale_x: float = 1.0, scale_y: float = 0.0 +) -> Tuple[Tensor, Tensor]: + # This resizes both the flow and the valid_flow_mask mask (which is assumed to be reasonably sparse) + # There are as-many non-zero values in the original flow as in the resized flow (up to OOB) + # So for example if scale_x = scale_y = 2, the sparsity of the output flow is multiplied by 4 + + h, w = flow.shape[-2:] + + h_new = int(round(h * scale_y)) + w_new = int(round(w * scale_x)) + flow_new = torch.zeros(size=[1, h_new, w_new], dtype=flow.dtype) + valid_new = torch.zeros(size=[h_new, w_new], dtype=valid_flow_mask.dtype) + + jj, ii = torch.meshgrid(torch.arange(w), torch.arange(h), indexing="xy") + + ii_valid, jj_valid = ii[valid_flow_mask], jj[valid_flow_mask] + + ii_valid_new = torch.round(ii_valid.to(float) * scale_y).to(torch.long) + jj_valid_new = torch.round(jj_valid.to(float) * scale_x).to(torch.long) + + within_bounds_mask = (0 <= ii_valid_new) & (ii_valid_new < h_new) & (0 <= jj_valid_new) & (jj_valid_new < w_new) + + ii_valid = ii_valid[within_bounds_mask] + jj_valid = jj_valid[within_bounds_mask] + ii_valid_new = ii_valid_new[within_bounds_mask] + jj_valid_new = jj_valid_new[within_bounds_mask] + + valid_flow_new = flow[:, ii_valid, jj_valid] + valid_flow_new *= scale_x + + flow_new[:, ii_valid_new, jj_valid_new] = valid_flow_new + valid_new[ii_valid_new, jj_valid_new] = valid_flow_mask[ii_valid, jj_valid] + + return flow_new, valid_new.bool() + + +class Compose(torch.nn.Module): + def __init__(self, transforms: List[Callable]): + super().__init__() + self.transforms = transforms + + @torch.inference_mode() + def forward(self, images, disparities, masks): + for t in self.transforms: + images, disparities, masks = t(images, disparities, masks) + return images, disparities, masks diff --git a/references/depth/stereo/utils/__init__.py b/references/depth/stereo/utils/__init__.py new file mode 100644 index 00000000000..4dacbe61ba0 --- /dev/null +++ b/references/depth/stereo/utils/__init__.py @@ -0,0 +1,6 @@ +from .losses import * +from .metrics import * +from .distributed import * +from .logger import * +from .padder import * +from .norm import * diff --git a/references/depth/stereo/utils/distributed.py b/references/depth/stereo/utils/distributed.py new file mode 100644 index 00000000000..228aa2a0f9a --- /dev/null +++ b/references/depth/stereo/utils/distributed.py @@ -0,0 +1,60 @@ +import os + +import torch +import torch.distributed as dist + + +def _redefine_print(is_main): + """disables printing when not in main process""" + import builtins as 
__builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if is_main or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def setup_ddp(args): + # Set the local_rank, rank, and world_size values as args fields + # This is done differently depending on how we're running the script. We + # currently support either torchrun or the custom run_with_submitit.py + # If you're confused (like I was), this might help a bit + # https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2 + + if "RANK" in os.environ and "WORLD_SIZE" in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ["WORLD_SIZE"]) + args.gpu = int(os.environ["LOCAL_RANK"]) + elif "SLURM_PROCID" in os.environ: + args.rank = int(os.environ["SLURM_PROCID"]) + args.gpu = args.rank % torch.cuda.device_count() + elif hasattr(args, "rank"): + pass + else: + print("Not using distributed mode") + args.distributed = False + args.world_size = 1 + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + dist.init_process_group( + backend="nccl", + rank=args.rank, + world_size=args.world_size, + init_method=args.dist_url, + ) + torch.distributed.barrier() + _redefine_print(is_main=(args.rank == 0)) + + +def reduce_across_processes(val): + t = torch.tensor(val, device="cuda") + dist.barrier() + dist.all_reduce(t) + return t diff --git a/references/depth/stereo/utils/logger.py b/references/depth/stereo/utils/logger.py new file mode 100644 index 00000000000..803e9aebd7b --- /dev/null +++ b/references/depth/stereo/utils/logger.py @@ -0,0 +1,153 @@ +import datetime +import time +from collections import defaultdict, deque + +import torch + +from .distributed import reduce_across_processes + + +class SmoothedValue: + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt="{median:.4f} ({global_avg:.4f})"): + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + t = reduce_across_processes([self.count, self.total]) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value + ) + + +class MetricLogger: + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + if not isinstance(v, (float, int)): + raise TypeError( + f"This method expects the value of the input arguments to be of type float or int, instead got {type(v)}" + ) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append(f"{name}: {str(meter)}") + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, **kwargs): + self.meters[name] = SmoothedValue(**kwargs) + + def log_every(self, iterable, print_freq=5, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if torch.cuda.is_available(): + log_msg = self.delimiter.join( + [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + "max mem: {memory:.0f}", + ] + ) + else: + log_msg = self.delimiter.join( + [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] + ) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if print_freq is not None and i % print_freq == 0: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + print( + log_msg.format( + i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) + ) + ) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print(f"{header} Total time: {total_time_str}") diff --git a/references/depth/stereo/utils/losses.py b/references/depth/stereo/utils/losses.py new file mode 100644 index 00000000000..1c21353a056 --- /dev/null +++ b/references/depth/stereo/utils/losses.py @@ -0,0 +1,503 @@ +from typing import List, Optional + +import torch +from torch import nn, Tensor +from torch.nn import functional 
as F +from torchvision.prototype.models.depth.stereo.raft_stereo import grid_sample, make_coords_grid + + +def make_gaussian_kernel(kernel_size: int, sigma: float) -> torch.Tensor: + """Function to create a 2D Gaussian kernel.""" + + x = torch.arange(kernel_size, dtype=torch.float32) + y = torch.arange(kernel_size, dtype=torch.float32) + x = x - (kernel_size - 1) / 2 + y = y - (kernel_size - 1) / 2 + x, y = torch.meshgrid(x, y, indexing="ij") + grid = (x**2 + y**2) / (2 * sigma**2) + kernel = torch.exp(-grid) + kernel = kernel / kernel.sum() + return kernel + + +def _sequence_loss_fn( + flow_preds: List[Tensor], + flow_gt: Tensor, + valid_flow_mask: Optional[Tensor], + gamma: Tensor, + max_flow: int = 256, + exclude_large: bool = False, + weights: Optional[Tensor] = None, +): + """Loss function defined over sequence of flow predictions""" + torch._assert( + gamma < 1, + "sequence_loss: `gamma` must be lower than 1, but got {}".format(gamma), + ) + + if exclude_large: + # exclude invalid pixels and extremely large diplacements + flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) + else: + valid_flow_mask = flow_norm < max_flow + + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask.unsqueeze(1) + flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) + + abs_diff = (flow_preds - flow_gt).abs() + if valid_flow_mask is not None: + abs_diff = abs_diff * valid_flow_mask.unsqueeze(0) + + abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) + num_predictions = flow_preds.shape[0] + + # allocating on CPU and moving to device during run-time can force + # an unwanted GPU synchronization that produces a large overhead + if weights is None or len(weights) != num_predictions: + weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) + + flow_loss = (abs_diff * weights).sum() + return flow_loss, weights + + +class SequenceLoss(nn.Module): + def __init__(self, gamma: float = 0.8, max_flow: int = 256, exclude_large_flows: bool = False) -> None: + """ + Args: + gamma: value for the exponential weighting of the loss across frames + max_flow: maximum flow value to exclude + exclude_large_flows: whether to exclude large flows + """ + + super().__init__() + self.max_flow = max_flow + self.excluding_large = exclude_large_flows + self.register_buffer("gamma", torch.tensor([gamma])) + # cache the scale factor for the loss + self._weights = None + + def forward(self, flow_preds: List[Tensor], flow_gt: Tensor, valid_flow_mask: Optional[Tensor]) -> Tensor: + """ + Args: + flow_preds: list of flow predictions of shape (batch_size, C, H, W) + flow_gt: ground truth flow of shape (batch_size, C, H, W) + valid_flow_mask: mask of valid flow pixels of shape (batch_size, H, W) + """ + loss, weights = _sequence_loss_fn( + flow_preds, flow_gt, valid_flow_mask, self.gamma, self.max_flow, self.excluding_large, self._weights + ) + self._weights = weights + return loss + + def set_gamma(self, gamma: float) -> None: + self.gamma.fill_(gamma) + # reset the cached scale factor + self._weights = None + + +def _ssim_loss_fn( + source: Tensor, + reference: Tensor, + kernel: Tensor, + eps: float = 1e-8, + c1: float = 0.01**2, + c2: float = 0.03**2, + use_padding: bool = False, +) -> Tensor: + # ref: Algorithm section: https://en.wikipedia.org/wiki/Structural_similarity + # ref: Alternative implementation: 
https://kornia.readthedocs.io/en/latest/_modules/kornia/metrics/ssim.html#ssim + + torch._assert( + source.ndim == reference.ndim == 4, + "SSIM: `source` and `reference` must be 4-dimensional tensors", + ) + + torch._assert( + source.shape == reference.shape, + "SSIM: `source` and `reference` must have the same shape, but got {} and {}".format( + source.shape, reference.shape + ), + ) + + B, C, H, W = source.shape + kernel = kernel.unsqueeze(0).unsqueeze(0).repeat(C, 1, 1, 1) + if use_padding: + pad_size = kernel.shape[2] // 2 + source = F.pad(source, (pad_size, pad_size, pad_size, pad_size), "reflect") + reference = F.pad(reference, (pad_size, pad_size, pad_size, pad_size), "reflect") + + mu1 = F.conv2d(source, kernel, groups=C) + mu2 = F.conv2d(reference, kernel, groups=C) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + + mu1_mu2 = mu1 * mu2 + mu_img1_sq = F.conv2d(source.pow(2), kernel, groups=C) + mu_img2_sq = F.conv2d(reference.pow(2), kernel, groups=C) + mu_img1_mu2 = F.conv2d(source * reference, kernel, groups=C) + + sigma1_sq = mu_img1_sq - mu1_sq + sigma2_sq = mu_img2_sq - mu2_sq + sigma12 = mu_img1_mu2 - mu1_mu2 + + numerator = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2) + denominator = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) + ssim = numerator / (denominator + eps) + + # doing 1 - ssim because we want to maximize the ssim + return 1 - ssim.mean(dim=(1, 2, 3)) + + +class SSIM(nn.Module): + def __init__( + self, + kernel_size: int = 11, + max_val: float = 1.0, + sigma: float = 1.5, + eps: float = 1e-12, + use_padding: bool = True, + ) -> None: + """SSIM loss function. + + Args: + kernel_size: size of the Gaussian kernel + max_val: constant scaling factor + sigma: sigma of the Gaussian kernel + eps: constant for division by zero + use_padding: whether to pad the input tensor such that we have a score for each pixel + """ + super().__init__() + + self.kernel_size = kernel_size + self.max_val = max_val + self.sigma = sigma + + gaussian_kernel = make_gaussian_kernel(kernel_size, sigma) + self.register_buffer("gaussian_kernel", gaussian_kernel) + + self.c1 = (0.01 * self.max_val) ** 2 + self.c2 = (0.03 * self.max_val) ** 2 + + self.use_padding = use_padding + self.eps = eps + + def forward(self, source: torch.Tensor, reference: torch.Tensor) -> torch.Tensor: + """ + Args: + source: source image of shape (batch_size, C, H, W) + reference: reference image of shape (batch_size, C, H, W) + + Returns: + SSIM loss of shape (batch_size,) + """ + return _ssim_loss_fn( + source, + reference, + kernel=self.gaussian_kernel, + c1=self.c1, + c2=self.c2, + use_padding=self.use_padding, + eps=self.eps, + ) + + +def _smoothness_loss_fn(img_gx: Tensor, img_gy: Tensor, val_gx: Tensor, val_gy: Tensor): + # ref: https://github.com/nianticlabs/monodepth2/blob/b676244e5a1ca55564eb5d16ab521a48f823af31/layers.py#L202 + + torch._assert( + img_gx.ndim >= 3, + "smoothness_loss: `img_gx` must be at least 3-dimensional tensor of shape (..., C, H, W)", + ) + + torch._assert( + img_gx.ndim == val_gx.ndim, + "smoothness_loss: `img_gx` and `depth_gx` must have the same dimensionality, but got {} and {}".format( + img_gx.ndim, val_gx.ndim + ), + ) + + for idx in range(img_gx.ndim): + torch._assert( + (img_gx.shape[idx] == val_gx.shape[idx] or (img_gx.shape[idx] == 1 or val_gx.shape[idx] == 1)), + "smoothness_loss: `img_gx` and `depth_gx` must have either the same shape or broadcastable shape, but got {} and {}".format( + img_gx.shape, val_gx.shape + ), + ) + + # -3 is channel dimension + weights_x = 
torch.exp(-torch.mean(torch.abs(val_gx), axis=-3, keepdim=True)) + weights_y = torch.exp(-torch.mean(torch.abs(val_gy), axis=-3, keepdim=True)) + + smoothness_x = img_gx * weights_x + smoothness_y = img_gy * weights_y + + smoothness = (torch.abs(smoothness_x) + torch.abs(smoothness_y)).mean(axis=(-3, -2, -1)) + return smoothness + + +class SmoothnessLoss(nn.Module): + def __init__(self) -> None: + super().__init__() + + def _x_gradient(self, img: Tensor) -> Tensor: + if img.ndim > 4: + original_shape = img.shape + is_reshaped = True + img = img.reshape(-1, *original_shape[-3:]) + else: + is_reshaped = False + + padded = F.pad(img, (0, 1, 0, 0), mode="replicate") + grad = padded[..., :, :-1] - padded[..., :, 1:] + if is_reshaped: + grad = grad.reshape(original_shape) + return grad + + def _y_gradient(self, x: torch.Tensor) -> torch.Tensor: + if x.ndim > 4: + original_shape = x.shape + is_reshaped = True + x = x.reshape(-1, *original_shape[-3:]) + else: + is_reshaped = False + + padded = F.pad(x, (0, 0, 0, 1), mode="replicate") + grad = padded[..., :-1, :] - padded[..., 1:, :] + if is_reshaped: + grad = grad.reshape(original_shape) + return grad + + def forward(self, images: Tensor, vals: Tensor) -> Tensor: + """ + Args: + images: tensor of shape (D1, D2, ..., DN, C, H, W) + vals: tensor of shape (D1, D2, ..., DN, 1, H, W) + + Returns: + smoothness loss of shape (D1, D2, ..., DN) + """ + img_gx = self._x_gradient(images) + img_gy = self._y_gradient(images) + + val_gx = self._x_gradient(vals) + val_gy = self._y_gradient(vals) + + return _smoothness_loss_fn(img_gx, img_gy, val_gx, val_gy) + + +def _flow_sequence_consistency_loss_fn( + flow_preds: List[Tensor], + gamma: float = 0.8, + resize_factor: float = 0.25, + rescale_factor: float = 0.25, + rescale_mode: str = "bilinear", + weights: Optional[Tensor] = None, +): + """Loss function defined over sequence of flow predictions""" + + # Simplified version of ref: https://arxiv.org/pdf/2006.11242.pdf + # In the original paper, an additional refinement network is used to refine a flow prediction. + # Each step performed by the recurrent module in Raft or CREStereo is a refinement step using a delta_flow update. + # which should be consistent with the previous step. In this implementation, we simplify the overall loss + # term and ignore left-right consistency loss or photometric loss which can be treated separately. 
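+ # In other words, the remaining term below is an exponentially weighted sum of the squared differences between consecutive flow predictions in the refinement sequence, optionally computed at a reduced resolution to smooth out bilinear upsampling artifacts; with gamma < 1, the differences between the latest refinement steps receive the largest weights.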
+ + torch._assert( + rescale_factor <= 1.0, + "sequence_consistency_loss: `rescale_factor` must be less than or equal to 1, but got {}".format( + rescale_factor + ), + ) + + flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) + N, B, C, H, W = flow_preds.shape + + # rescale flow predictions to account for bilinear upsampling artifacts + if rescale_factor: + flow_preds = ( + F.interpolate( + flow_preds.view(N * B, C, H, W), scale_factor=resize_factor, mode=rescale_mode, align_corners=True + ) + ) * rescale_factor + flow_preds = torch.stack(torch.chunk(flow_preds, N, dim=0), dim=0) + + # force the next prediction to be similar to the previous prediction + abs_diff = (flow_preds[1:] - flow_preds[:-1]).square() + abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) + + num_predictions = flow_preds.shape[0] - 1 # because we are comparing differences + if weights is None or len(weights) != num_predictions: + weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) + + flow_loss = (abs_diff * weights).sum() + return flow_loss, weights + + +class FlowSequenceConsistencyLoss(nn.Module): + def __init__( + self, + gamma: float = 0.8, + resize_factor: float = 0.25, + rescale_factor: float = 0.25, + rescale_mode: str = "bilinear", + ) -> None: + super().__init__() + self.gamma = gamma + self.resize_factor = resize_factor + self.rescale_factor = rescale_factor + self.rescale_mode = rescale_mode + self._weights = None + + def forward(self, flow_preds: List[Tensor]) -> Tensor: + """ + Args: + flow_preds: list of tensors of shape (batch_size, C, H, W) + + Returns: + sequence consistency loss of shape (batch_size,) + """ + loss, weights = _flow_sequence_consistency_loss_fn( + flow_preds, + gamma=self.gamma, + resize_factor=self.resize_factor, + rescale_factor=self.rescale_factor, + rescale_mode=self.rescale_mode, + weights=self._weights, + ) + self._weights = weights + return loss + + def set_gamma(self, gamma: float) -> None: + self.gamma.fill_(gamma) + # reset the cached scale factor + self._weights = None + + +def _psnr_loss_fn(source: torch.Tensor, target: torch.Tensor, max_val: float) -> torch.Tensor: + torch._assert( + source.shape == target.shape, + "psnr_loss: source and target must have the same shape, but got {} and {}".format(source.shape, target.shape), + ) + + # ref https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + return 10 * torch.log10(max_val**2 / ((source - target).pow(2).mean(axis=(-3, -2, -1)))) + + +class PSNRLoss(nn.Module): + def __init__(self, max_val: float = 256) -> None: + """ + Args: + max_val: maximum value of the input tensor. This refers to the maximum domain value of the input tensor. 
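+ (e.g. 255.0 for raw 8-bit images, or 1.0 for inputs normalized to [0, 1])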
+ + """ + super().__init__() + self.max_val = max_val + + def forward(self, source: Tensor, target: Tensor) -> Tensor: + """ + Args: + source: tensor of shape (D1, D2, ..., DN, C, H, W) + target: tensor of shape (D1, D2, ..., DN, C, H, W) + + Returns: + psnr loss of shape (D1, D2, ..., DN) + """ + + # multiply by -1 as we want to maximize the psnr + return -1 * _psnr_loss_fn(source, target, self.max_val) + + +class FlowPhotoMetricLoss(nn.Module): + def __init__( + self, + ssim_weight: float = 0.85, + ssim_window_size: int = 11, + ssim_max_val: float = 1.0, + ssim_sigma: float = 1.5, + ssim_eps: float = 1e-12, + ssim_use_padding: bool = True, + max_displacement_ratio: float = 0.15, + ) -> None: + super().__init__() + + self._ssim_loss = SSIM( + kernel_size=ssim_window_size, + max_val=ssim_max_val, + sigma=ssim_sigma, + eps=ssim_eps, + use_padding=ssim_use_padding, + ) + + self._L1_weight = 1 - ssim_weight + self._SSIM_weight = ssim_weight + self._max_displacement_ratio = max_displacement_ratio + + def forward( + self, + source: Tensor, + reference: Tensor, + flow_pred: Tensor, + valid_mask: Optional[Tensor] = None, + ): + """ + Args: + source: tensor of shape (B, C, H, W) + reference: tensor of shape (B, C, H, W) + flow_pred: tensor of shape (B, 2, H, W) + valid_mask: tensor of shape (B, H, W) or None + + Returns: + photometric loss of shape + + """ + torch._assert( + source.ndim == 4, + "FlowPhotoMetricLoss: source must have 4 dimensions, but got {}".format(source.ndim), + ) + torch._assert( + reference.ndim == source.ndim, + "FlowPhotoMetricLoss: source and other must have the same number of dimensions, but got {} and {}".format( + source.ndim, reference.ndim + ), + ) + torch._assert( + flow_pred.shape[1] == 2, + "FlowPhotoMetricLoss: flow_pred must have 2 channels, but got {}".format(flow_pred.shape[1]), + ) + torch._assert( + flow_pred.ndim == 4, + "FlowPhotoMetricLoss: flow_pred must have 4 dimensions, but got {}".format(flow_pred.ndim), + ) + + B, C, H, W = source.shape + flow_channels = flow_pred.shape[1] + + max_displacements = [] + for dim in range(flow_channels): + shape_index = -1 - dim + max_displacements.append(int(self._max_displacement_ratio * source.shape[shape_index])) + + # mask out all pixels that have larger flow than the max flow allowed + max_flow_mask = torch.logical_and( + *[flow_pred[:, dim, :, :] < max_displacements[dim] for dim in range(flow_channels)] + ) + + if valid_mask is not None: + valid_mask = torch.logical_and(valid_mask, max_flow_mask).unsqueeze(1) + else: + valid_mask = max_flow_mask.unsqueeze(1) + + grid = make_coords_grid(B, H, W, device=str(source.device)) + resampled_grids = grid - flow_pred + resampled_grids = resampled_grids.permute(0, 2, 3, 1) + resampled_source = grid_sample(reference, resampled_grids, mode="bilinear") + + # compute SSIM loss + ssim_loss = self._ssim_loss(resampled_source * valid_mask, source * valid_mask) + l1_loss = (resampled_source * valid_mask - source * valid_mask).abs().mean(axis=(-3, -2, -1)) + loss = self._L1_weight * l1_loss + self._SSIM_weight * ssim_loss + + return loss.mean() diff --git a/references/depth/stereo/utils/metrics.py b/references/depth/stereo/utils/metrics.py new file mode 100644 index 00000000000..05b149fb048 --- /dev/null +++ b/references/depth/stereo/utils/metrics.py @@ -0,0 +1,49 @@ +from typing import Dict, List, Optional, Tuple + +from torch import Tensor + +AVAILABLE_METRICS = ["mae", "rmse", "epe", "bad1", "bad2", "epe", "1px", "3px", "5px", "fl-all", "relepe"] + + +def compute_metrics( + 
flow_pred: Tensor, flow_gt: Tensor, valid_flow_mask: Optional[Tensor], metrics: List[str] +) -> Tuple[Dict[str, float], int]: + for m in metrics: + if m not in AVAILABLE_METRICS: + raise ValueError(f"Invalid metric: {m}. Valid metrics are: {AVAILABLE_METRICS}") + + metrics_dict = {} + + pixels_diffs = (flow_pred - flow_gt).abs() + # there is no Y flow in Stereo Matching, therefore flow.abs() = flow.pow(2).sum(dim=1).sqrt() + flow_norm = flow_gt.abs() + + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask.unsqueeze(1) + pixels_diffs = pixels_diffs[valid_flow_mask] + flow_norm = flow_norm[valid_flow_mask] + + num_pixels = pixels_diffs.numel() + if "bad1" in metrics: + metrics_dict["bad1"] = (pixels_diffs > 1).float().mean().item() + if "bad2" in metrics: + metrics_dict["bad2"] = (pixels_diffs > 2).float().mean().item() + + if "mae" in metrics: + metrics_dict["mae"] = pixels_diffs.mean().item() + if "rmse" in metrics: + metrics_dict["rmse"] = pixels_diffs.pow(2).mean().sqrt().item() + if "epe" in metrics: + metrics_dict["epe"] = pixels_diffs.mean().item() + if "1px" in metrics: + metrics_dict["1px"] = (pixels_diffs < 1).float().mean().item() + if "3px" in metrics: + metrics_dict["3px"] = (pixels_diffs < 3).float().mean().item() + if "5px" in metrics: + metrics_dict["5px"] = (pixels_diffs < 5).float().mean().item() + if "fl-all" in metrics: + metrics_dict["fl-all"] = ((pixels_diffs < 3) & ((pixels_diffs / flow_norm) < 0.05)).float().mean().item() * 100 + if "relepe" in metrics: + metrics_dict["relepe"] = (pixels_diffs / flow_norm).mean().item() + + return metrics_dict, num_pixels diff --git a/references/depth/stereo/utils/norm.py b/references/depth/stereo/utils/norm.py new file mode 100644 index 00000000000..7f6e0011160 --- /dev/null +++ b/references/depth/stereo/utils/norm.py @@ -0,0 +1,13 @@ +import torch + + +def freeze_batch_norm(model): + for m in model.modules(): + if isinstance(m, torch.nn.BatchNorm2d): + m.eval() + + +def unfreeze_batch_norm(model): + for m in model.modules(): + if isinstance(m, torch.nn.BatchNorm2d): + m.train() diff --git a/references/depth/stereo/utils/padder.py b/references/depth/stereo/utils/padder.py new file mode 100644 index 00000000000..7d2c63afba6 --- /dev/null +++ b/references/depth/stereo/utils/padder.py @@ -0,0 +1,28 @@ +import torch.nn.functional as F + + +class InputPadder: + """Pads images such that dimensions are divisible by 8""" + + # TODO: Ideally, this should be part of the eval transforms preset, instead + # of being part of the validation code. It's not obvious what a good + # solution would be, because we need to unpad the predicted flows according + # to the input images' size, and in some datasets (Kitti) images can have + # variable sizes. 
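To see how `compute_metrics` above and the `InputPadder` defined just below fit together, here is a hedged sketch of a per-pair evaluation step; `model` is a placeholder for any stereo network that returns a full-resolution disparity map, and the import relies on the re-exports in `utils/__init__.py` shown earlier:

```python
import torch

from utils import compute_metrics, InputPadder  # re-exported by utils/__init__.py


def evaluate_pair(model, left, right, disp_gt, valid_mask):
    # left/right: (1, 3, H, W) float, disp_gt: (1, 1, H, W), valid_mask: (1, H, W) bool
    padder = InputPadder(left.shape, mode="kitti")  # pad H and W up to multiples of 8
    left, right = padder.pad(left, right)
    with torch.inference_mode():
        disp_pred = model(left, right)  # assumed to return a (1, 1, H_pad, W_pad) disparity map
    disp_pred = padder.unpad(disp_pred)  # crop back to the original resolution
    metrics, num_px = compute_metrics(
        disp_pred, disp_gt, valid_mask, metrics=["mae", "epe", "bad1", "bad2", "1px"]
    )
    return metrics, num_px
```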
+ + def __init__(self, dims, mode="sintel"): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 + pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 + if mode == "sintel": + self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2] + else: + self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] + + def pad(self, *inputs): + return [F.pad(x, self._pad, mode="replicate") for x in inputs] + + def unpad(self, x): + ht, wd = x.shape[-2:] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0] : c[1], c[2] : c[3]] diff --git a/references/depth/stereo/visualization.py b/references/depth/stereo/visualization.py new file mode 100644 index 00000000000..07a7e7167d3 --- /dev/null +++ b/references/depth/stereo/visualization.py @@ -0,0 +1,127 @@ +import os +from typing import List + +import numpy as np +import numpy.typing as npt +import torch +from torch import Tensor +from torchvision.utils import make_grid + + +@torch.no_grad() +def make_disparity_image(disparity: Tensor): + # normalize image to [0, 1] + disparity = disparity.detach().cpu() + disparity = (disparity - disparity.min()) / (disparity.max() - disparity.min()) + return disparity + + +@torch.no_grad() +def make_disparity_image_pairs(disparity: Tensor, image: Tensor): + disparity = make_disparity_image(disparity) + # image is in [-1, 1], bring it to [0, 1] + image = image.detach().cpu() + image = image * 0.5 + 0.5 + return disparity, image + + +@torch.no_grad() +def make_disparity_sequence(disparities: List[Tensor]): + # convert each disparity to [0, 1] + for idx, disparity_batch in enumerate(disparities): + disparities[idx] = torch.stack(list(map(make_disparity_image, disparity_batch))) + # make the list into a batch + disparity_sequences = torch.stack(disparities) + return disparity_sequences + + +@torch.no_grad() +def make_pair_grid(*inputs, orientation="horizontal"): + # make a grid of images with the outputs and references side by side + if orientation == "horizontal": + # interleave the outputs and references + canvas = torch.zeros_like(inputs[0]) + canvas = torch.cat([canvas] * len(inputs), dim=0) + size = len(inputs) + for idx, inp in enumerate(inputs): + canvas[idx::size, ...] = inp + grid = make_grid(canvas, nrow=len(inputs), padding=16, normalize=True, scale_each=True) + elif orientation == "vertical": + # interleave the outputs and references + canvas = torch.cat(inputs, dim=0) + size = len(inputs) + for idx, inp in enumerate(inputs): + canvas[idx::size, ...] 
= inp + grid = make_grid(canvas, nrow=len(inputs[0]), padding=16, normalize=True, scale_each=True) + else: + raise ValueError("Unknown orientation: {}".format(orientation)) + return grid + + +@torch.no_grad() +def make_training_sample_grid( + left_images: Tensor, + right_images: Tensor, + disparities: Tensor, + masks: Tensor, + predictions: List[Tensor], +) -> npt.NDArray: + # detach images and renormalize to [0, 1] + images_left = left_images.detach().cpu() * 0.5 + 0.5 + images_right = right_images.detach().cpu() * 0.5 + 0.5 + # detach the disparties and predictions + disparities = disparities.detach().cpu() + predictions = predictions[-1].detach().cpu() + # keep only the first channel of pixels, and repeat it 3 times + disparities = disparities[:, :1, ...].repeat(1, 3, 1, 1) + predictions = predictions[:, :1, ...].repeat(1, 3, 1, 1) + # unsqueeze and repeat the masks + masks = masks.detach().cpu().unsqueeze(1).repeat(1, 3, 1, 1) + # make a grid that will self normalize across the batch + pred_grid = make_pair_grid(images_left, images_right, masks, disparities, predictions, orientation="horizontal") + pred_grid = pred_grid.permute(1, 2, 0).numpy() + pred_grid = (pred_grid * 255).astype(np.uint8) + return pred_grid + + +@torch.no_grad() +def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> npt.NDArray: + # right most we will be adding the ground truth + seq_len = len(predictions) + 1 + predictions = list(map(lambda x: x[:, :1, :, :].detach().cpu(), predictions + [disparities])) + sequence = make_disparity_sequence(predictions) + # swap axes to have the in the correct order for each batch sample + sequence = torch.swapaxes(sequence, 0, 1).contiguous().reshape(-1, 1, disparities.shape[-2], disparities.shape[-1]) + sequence = make_grid(sequence, nrow=seq_len, padding=16, normalize=True, scale_each=True) + sequence = sequence.permute(1, 2, 0).numpy() + sequence = (sequence * 255).astype(np.uint8) + return sequence + + +@torch.no_grad() +def make_prediction_image_side_to_side( + predictions: Tensor, disparities: Tensor, valid_mask: Tensor, save_path: str, prefix: str +) -> None: + import matplotlib.pyplot as plt + + # normalize the predictions and disparities in [0, 1] + predictions = (predictions - predictions.min()) / (predictions.max() - predictions.min()) + disparities = (disparities - disparities.min()) / (disparities.max() - disparities.min()) + predictions = predictions * valid_mask + disparities = disparities * valid_mask + + predictions = predictions.detach().cpu() + disparities = disparities.detach().cpu() + + for idx, (pred, gt) in enumerate(zip(predictions, disparities)): + pred = pred.permute(1, 2, 0).numpy() + gt = gt.permute(1, 2, 0).numpy() + # plot pred and gt side by side + fig, ax = plt.subplots(1, 2, figsize=(10, 5)) + ax[0].imshow(pred) + ax[0].set_title("Prediction") + ax[1].imshow(gt) + ax[1].set_title("Ground Truth") + save_name = os.path.join(save_path, "{}_{}.png".format(prefix, idx)) + plt.savefig(save_name) + plt.close() diff --git a/references/detection/README.md b/references/detection/README.md index aec7c10e1b5..d9af26523a5 100644 --- a/references/detection/README.md +++ b/references/detection/README.md @@ -68,7 +68,7 @@ torchrun --nproc_per_node=8 train.py\ torchrun --nproc_per_node=8 train.py\ --dataset coco --model ssdlite320_mobilenet_v3_large --epochs 660\ --aspect-ratio-group-factor 3 --lr-scheduler cosineannealinglr --lr 0.15 --batch-size 24\ - --weight-decay 0.00004 --data-augmentation ssdlite --weights-backbone 
MobileNet_V3_Large_Weights.IMAGENET1K_V1 + --weight-decay 0.00004 --data-augmentation ssdlite ``` diff --git a/references/detection/coco_utils.py b/references/detection/coco_utils.py index 396de63297b..44b917a6ec6 100644 --- a/references/detection/coco_utils.py +++ b/references/detection/coco_utils.py @@ -1,4 +1,3 @@ -import copy import os import torch @@ -9,24 +8,6 @@ from pycocotools.coco import COCO -class FilterAndRemapCocoCategories: - def __init__(self, categories, remap=True): - self.categories = categories - self.remap = remap - - def __call__(self, image, target): - anno = target["annotations"] - anno = [obj for obj in anno if obj["category_id"] in self.categories] - if not self.remap: - target["annotations"] = anno - return image, target - anno = copy.deepcopy(anno) - for obj in anno: - obj["category_id"] = self.categories.index(obj["category_id"]) - target["annotations"] = anno - return image, target - - def convert_coco_poly_to_mask(segmentations, height, width): masks = [] for polygons in segmentations: @@ -49,7 +30,6 @@ def __call__(self, image, target): w, h = image.size image_id = target["image_id"] - image_id = torch.tensor([image_id]) anno = target["annotations"] @@ -116,7 +96,7 @@ def _has_valid_annotation(anno): # if all boxes have close to zero area, there is no annotation if _has_only_empty_bbox(anno): return False - # keypoints task have a slight different critera for considering + # keypoints task have a slight different criteria for considering # if an annotation is valid if "keypoints" not in anno[0]: return True @@ -126,10 +106,6 @@ def _has_valid_annotation(anno): return True return False - if not isinstance(dataset, torchvision.datasets.CocoDetection): - raise TypeError( - f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" - ) ids = [] for ds_idx, img_id in enumerate(dataset.ids): ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) @@ -147,13 +123,13 @@ def convert_to_coco_api(ds): coco_ds = COCO() # annotation IDs need to start at 1, not 0, see torchvision issue #1530 ann_id = 1 - dataset = {"images": [], "categories": [], "annotations": []} + dataset = {"images": [], "categories": [], "annotations": [], "info": {}} categories = set() for img_idx in range(len(ds)): # find better way to get target # targets = ds.get_annotations(img_idx) img, targets = ds[img_idx] - image_id = targets["image_id"].item() + image_id = targets["image_id"] img_dict = {} img_dict["id"] = image_id img_dict["height"] = img.shape[-2] @@ -196,6 +172,7 @@ def convert_to_coco_api(ds): def get_coco_api_from_dataset(dataset): + # FIXME: This is... awful? 
for _ in range(10): if isinstance(dataset, torchvision.datasets.CocoDetection): break @@ -220,7 +197,7 @@ def __getitem__(self, idx): return img, target -def get_coco(root, image_set, transforms, mode="instances"): +def get_coco(root, image_set, transforms, mode="instances", use_v2=False, with_masks=False): anno_file_template = "{}_{}2017.json" PATHS = { "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), @@ -228,17 +205,26 @@ def get_coco(root, image_set, transforms, mode="instances"): # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) } - t = [ConvertCocoPolysToMask()] - - if transforms is not None: - t.append(transforms) - transforms = T.Compose(t) - img_folder, ann_file = PATHS[image_set] img_folder = os.path.join(root, img_folder) ann_file = os.path.join(root, ann_file) - dataset = CocoDetection(img_folder, ann_file, transforms=transforms) + if use_v2: + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + target_keys = ["boxes", "labels", "image_id"] + if with_masks: + target_keys += ["masks"] + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + else: + # TODO: handle with_masks for V1? + t = [ConvertCocoPolysToMask()] + if transforms is not None: + t.append(transforms) + transforms = T.Compose(t) + + dataset = CocoDetection(img_folder, ann_file, transforms=transforms) if image_set == "train": dataset = _coco_remove_images_without_annotations(dataset) @@ -246,7 +232,3 @@ def get_coco(root, image_set, transforms, mode="instances"): # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) return dataset - - -def get_coco_kp(root, image_set, transforms): - return get_coco(root, image_set, transforms, mode="person_keypoints") diff --git a/references/detection/engine.py b/references/detection/engine.py index 0e5d55f189d..0e9bfffdf8a 100644 --- a/references/detection/engine.py +++ b/references/detection/engine.py @@ -26,7 +26,7 @@ def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, sc for images, targets in metric_logger.log_every(data_loader, print_freq, header): images = list(image.to(device) for image in images) - targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets] with torch.cuda.amp.autocast(enabled=scaler is not None): loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) @@ -97,7 +97,7 @@ def evaluate(model, data_loader, device): outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] model_time = time.time() - model_time - res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} + res = {target["image_id"]: output for target, output in zip(targets, outputs)} evaluator_time = time.time() coco_evaluator.update(res) evaluator_time = time.time() - evaluator_time diff --git a/references/detection/group_by_aspect_ratio.py b/references/detection/group_by_aspect_ratio.py index 5312cc036d6..d12e14b540c 100644 --- a/references/detection/group_by_aspect_ratio.py +++ b/references/detection/group_by_aspect_ratio.py @@ -63,7 +63,7 @@ def __iter__(self): expected_num_batches = len(self) num_remaining = expected_num_batches - num_batches if num_remaining > 0: - # for the remaining batches, take first the buffers with largest number + # for the 
remaining batches, take first the buffers with the largest number # of elements for group_id, _ in sorted(buffer_per_group.items(), key=lambda x: len(x[1]), reverse=True): remaining = self.batch_size - len(buffer_per_group[group_id]) diff --git a/references/detection/presets.py b/references/detection/presets.py index 779f3f218ca..e9b6d56c886 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -1,73 +1,114 @@ +from collections import defaultdict + import torch -import transforms as T +import transforms as reference_transforms + + +def get_modules(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + import torchvision.tv_tensors + + return torchvision.transforms.v2, torchvision.tv_tensors + else: + return reference_transforms, None class DetectionPresetTrain: - def __init__(self, *, data_augmentation, hflip_prob=0.5, mean=(123.0, 117.0, 104.0)): + # Note: this transform assumes that the input to forward() are always PIL + # images, regardless of the backend parameter. + def __init__( + self, + *, + data_augmentation, + hflip_prob=0.5, + mean=(123.0, 117.0, 104.0), + backend="pil", + use_v2=False, + ): + + T, tv_tensors = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tv_tensor": + transforms.append(T.ToImage()) + elif backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + if data_augmentation == "hflip": - self.transforms = T.Compose( - [ - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [T.RandomHorizontalFlip(p=hflip_prob)] elif data_augmentation == "lsj": - self.transforms = T.Compose( - [ - T.ScaleJitter(target_size=(1024, 1024)), - T.FixedSizeCrop(size=(1024, 1024), fill=mean), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.ScaleJitter(target_size=(1024, 1024), antialias=True), + # TODO: FixedSizeCrop below doesn't work on tensors! 
+ reference_transforms.FixedSizeCrop(size=(1024, 1024), fill=mean), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "multiscale": - self.transforms = T.Compose( - [ - T.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.RandomShortestSize(min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "ssd": - self.transforms = T.Compose( - [ - T.RandomPhotometricDistort(), - T.RandomZoomOut(fill=list(mean)), - T.RandomIoUCrop(), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + fill = defaultdict(lambda: mean, {tv_tensors.Mask: 0}) if use_v2 else list(mean) + transforms += [ + T.RandomPhotometricDistort(), + T.RandomZoomOut(fill=fill), + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "ssdlite": - self.transforms = T.Compose( - [ - T.RandomIoUCrop(), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + ] else: raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"') + if backend == "pil": + # Note: we could just convert to pure tensors even in v2. + transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + + transforms += [T.ToDtype(torch.float, scale=True)] + + if use_v2: + transforms += [ + T.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.XYXY), + T.SanitizeBoundingBoxes(), + T.ToPureTensor(), + ] + + self.transforms = T.Compose(transforms) + def __call__(self, img, target): return self.transforms(img, target) class DetectionPresetEval: - def __init__(self): - self.transforms = T.Compose( - [ - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + def __init__(self, backend="pil", use_v2=False): + T, _ = get_modules(use_v2) + transforms = [] + backend = backend.lower() + if backend == "pil": + # Note: we could just convert to pure tensors even in v2? 
+ transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + elif backend == "tensor": + transforms += [T.PILToTensor()] + elif backend == "tv_tensor": + transforms += [T.ToImage()] + else: + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + transforms += [T.ToDtype(torch.float, scale=True)] + + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/detection/train.py b/references/detection/train.py index dea483c5f75..6a9ffb0af4d 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -28,7 +28,7 @@ import torchvision.models.detection import torchvision.models.detection.mask_rcnn import utils -from coco_utils import get_coco, get_coco_kp +from coco_utils import get_coco from engine import evaluate, train_one_epoch from group_by_aspect_ratio import create_aspect_ratio_groups, GroupedBatchSampler from torchvision.transforms import InterpolationMode @@ -40,23 +40,32 @@ def copypaste_collate_fn(batch): return copypaste(*utils.collate_fn(batch)) -def get_dataset(name, image_set, transform, data_path): - paths = {"coco": (data_path, get_coco, 91), "coco_kp": (data_path, get_coco_kp, 2)} - p, ds_fn, num_classes = paths[name] - - ds = ds_fn(p, image_set=image_set, transforms=transform) +def get_dataset(is_train, args): + image_set = "train" if is_train else "val" + num_classes, mode = {"coco": (91, "instances"), "coco_kp": (2, "person_keypoints")}[args.dataset] + with_masks = "mask" in args.model + ds = get_coco( + root=args.data_path, + image_set=image_set, + transforms=get_transform(is_train, args), + mode=mode, + use_v2=args.use_v2, + with_masks=with_masks, + ) return ds, num_classes -def get_transform(train, args): - if train: - return presets.DetectionPresetTrain(data_augmentation=args.data_augmentation) +def get_transform(is_train, args): + if is_train: + return presets.DetectionPresetTrain( + data_augmentation=args.data_augmentation, backend=args.backend, use_v2=args.use_v2 + ) elif args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) trans = weights.transforms() return lambda img, target: (trans(img), target) else: - return presets.DetectionPresetEval() + return presets.DetectionPresetEval(backend=args.backend, use_v2=args.use_v2) def get_args_parser(add_help=True): @@ -65,7 +74,12 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description="PyTorch Detection Training", add_help=add_help) parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path") - parser.add_argument("--dataset", default="coco", type=str, help="dataset name") + parser.add_argument( + "--dataset", + default="coco", + type=str, + help="dataset name. Use coco for object detection and instance segmentation and coco_kp for Keypoint detection", + ) parser.add_argument("--model", default="maskrcnn_resnet50_fpn", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( @@ -159,10 +173,22 @@ def get_args_parser(add_help=True): help="Use CopyPaste data augmentation. 
Works only with data-augmentation='lsj'.", ) + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") + return parser def main(args): + if args.backend.lower() == "tv_tensor" and not args.use_v2: + raise ValueError("Use --use-v2 if you want to use the tv_tensor backend.") + if args.dataset not in ("coco", "coco_kp"): + raise ValueError(f"Dataset should be coco or coco_kp, got {args.dataset}") + if "keypoint" in args.model and args.dataset != "coco_kp": + raise ValueError("Oops, if you want Keypoint detection, set --dataset coco_kp") + if args.dataset == "coco_kp" and args.use_v2: + raise ValueError("KeyPoint detection doesn't support V2 transforms yet") + if args.output_dir: utils.mkdir(args.output_dir) @@ -177,8 +203,8 @@ def main(args): # Data loading code print("Loading data") - dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args), args.data_path) - dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args), args.data_path) + dataset, num_classes = get_dataset(is_train=True, args=args) + dataset_test, _ = get_dataset(is_train=False, args=args) print("Creating data loaders") if args.distributed: @@ -262,7 +288,7 @@ def main(args): ) if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) optimizer.load_state_dict(checkpoint["optimizer"]) lr_scheduler.load_state_dict(checkpoint["lr_scheduler"]) diff --git a/references/detection/transforms.py b/references/detection/transforms.py index d26bf6eac85..e07ccfc9921 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -53,14 +53,17 @@ def forward( return image, target -class ConvertImageDtype(nn.Module): - def __init__(self, dtype: torch.dtype) -> None: +class ToDtype(nn.Module): + def __init__(self, dtype: torch.dtype, scale: bool = False) -> None: super().__init__() self.dtype = dtype + self.scale = scale def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if not self.scale: + return image.to(dtype=self.dtype), target image = F.convert_image_dtype(image, self.dtype) return image, target @@ -293,11 +296,13 @@ def __init__( target_size: Tuple[int, int], scale_range: Tuple[float, float] = (0.1, 2.0), interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias=True, ): super().__init__() self.target_size = target_size self.scale_range = scale_range self.interpolation = interpolation + self.antialias = antialias def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None @@ -315,14 +320,17 @@ def forward( new_width = int(orig_width * r) new_height = int(orig_height * r) - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) + image = F.resize(image, [new_height, new_width], interpolation=self.interpolation, antialias=self.antialias) if target is not None: target["boxes"][:, 0::2] *= new_width / orig_width target["boxes"][:, 1::2] *= new_height / orig_height if "masks" in target: target["masks"] = F.resize( - target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST + target["masks"], + [new_height, new_width], + interpolation=InterpolationMode.NEAREST, + antialias=self.antialias, ) return image, target diff --git 
a/references/optical_flow/README.md b/references/optical_flow/README.md index a7ac0223739..6ad1d4079f7 100644 --- a/references/optical_flow/README.md +++ b/references/optical_flow/README.md @@ -56,7 +56,7 @@ torchrun --nproc_per_node 1 --nnodes 1 train.py --val-dataset sintel --batch-siz This should give an epe of about 1.3822 on the clean pass and 2.7161 on the final pass of Sintel-train. Results may vary slightly depending on the batch -size and the number of GPUs. For the most accurate resuts use 1 GPU and +size and the number of GPUs. For the most accurate results use 1 GPU and `--batch-size 1`: ``` diff --git a/references/optical_flow/train.py b/references/optical_flow/train.py index be6ffe4ccef..7012ea6f810 100644 --- a/references/optical_flow/train.py +++ b/references/optical_flow/train.py @@ -82,7 +82,7 @@ def _evaluate(model, args, val_dataset, *, padder_mode, num_flow_updates=None, b def inner_loop(blob): if blob[0].dim() == 3: - # input is not batched so we add an extra dim for consistency + # input is not batched, so we add an extra dim for consistency blob = [x[None, :, :, :] if x is not None else None for x in blob] image1, image2, flow_gt = blob[:3] @@ -150,7 +150,7 @@ def preprocessing(img1, img2, flow, valid_flow_mask): for name in val_datasets: if name == "kitti": - # Kitti has different image sizes so we need to individually pad them, we can't batch. + # Kitti has different image sizes, so we need to individually pad them, we can't batch. # see comment in InputPadder if args.batch_size != 1 and (not args.distributed or args.rank == 0): warnings.warn( @@ -226,7 +226,7 @@ def main(args): model_without_ddp = model if args.resume is not None: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) if args.test_only: diff --git a/references/optical_flow/transforms.py b/references/optical_flow/transforms.py index 6011608183a..bc831a2ee52 100644 --- a/references/optical_flow/transforms.py +++ b/references/optical_flow/transforms.py @@ -164,7 +164,7 @@ class RandomResizeAndCrop(torch.nn.Module): # The reason we don't rely on RandomResizedCrop is because of a significant # difference in the parametrization of both transforms, in particular, # because of the way the random parameters are sampled in both transforms, - # which leads to fairly different resuts (and different epe). For more details see + # which leads to fairly different results (and different epe). For more details see # https://github.com/pytorch/vision/pull/5026/files#r762932579 def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, stretch_prob=0.8): super().__init__() @@ -196,8 +196,12 @@ def forward(self, img1, img2, flow, valid_flow_mask): if torch.rand(1).item() < self.resize_prob: # rescale the images - img1 = F.resize(img1, size=(new_h, new_w)) - img2 = F.resize(img2, size=(new_h, new_w)) + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the OF models with antialias=True? 
+ img1 = F.resize(img1, size=(new_h, new_w), antialias=False) + img2 = F.resize(img2, size=(new_h, new_w), antialias=False) if valid_flow_mask is None: flow = F.resize(flow, size=(new_h, new_w)) flow = flow * torch.tensor([scale_x, scale_y])[:, None, None] @@ -208,7 +212,7 @@ def forward(self, img1, img2, flow, valid_flow_mask): # Note: For sparse datasets (Kitti), the original code uses a "margin" # See e.g. https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 - # We don't, not sure it matters much + # We don't, not sure if it matters much y0 = torch.randint(0, img1.shape[1] - self.crop_size[0], size=(1,)).item() x0 = torch.randint(0, img1.shape[2] - self.crop_size[1], size=(1,)).item() diff --git a/references/optical_flow/utils.py b/references/optical_flow/utils.py index 8b07e9de35c..cd4b16eb0d8 100644 --- a/references/optical_flow/utils.py +++ b/references/optical_flow/utils.py @@ -181,7 +181,7 @@ def sequence_loss(flow_preds, flow_gt, valid_flow_mask, gamma=0.8, max_flow=400) if gamma > 1: raise ValueError(f"Gamma should be < 1, got {gamma}.") - # exlude invalid pixels and extremely large diplacements + # exclude invalid pixels and extremely large displacements flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) @@ -248,7 +248,7 @@ def setup_ddp(args): # https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2 if all(key in os.environ for key in ("LOCAL_RANK", "RANK", "WORLD_SIZE")): - # if we're here, the script was called with torchrun. Otherwise + # if we're here, the script was called with torchrun. Otherwise, # these args will be set already by the run_with_submitit script args.local_rank = int(os.environ["LOCAL_RANK"]) args.rank = int(os.environ["RANK"]) diff --git a/references/segmentation/README.md b/references/segmentation/README.md index 2c7391c8380..2c8e581dac1 100644 --- a/references/segmentation/README.md +++ b/references/segmentation/README.md @@ -1,7 +1,7 @@ # Semantic segmentation reference training scripts This folder contains reference training scripts for semantic segmentation. -They serve as a log of how to train specific models, as provide baseline +They serve as a log of how to train specific models and provide baseline training and evaluation scripts to quickly bootstrap research. All models have been trained on 8x V100 GPUs.
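Both the optical-flow transform above and the video presets further down hard-code `antialias=False` in their `F.resize` calls to preserve the behavior the released checkpoints were trained with, after the default for tensor inputs changed (https://github.com/pytorch/vision/pull/7160). As a rough, illustrative sketch of what the flag changes for a float CHW tensor with the default bilinear interpolation (sizes and values below are made up, not taken from the reference scripts):

```python
import torch
from torchvision.transforms import functional as F

img = torch.rand(3, 240, 320)  # made-up float CHW tensor

# Legacy tensor behaviour: no low-pass filtering before subsampling. This is what the
# optical-flow and video references pin so the pretrained weights keep matching their
# published metrics.
legacy = F.resize(img, [120, 160], antialias=False)

# Current default for tensors: antialiased downscaling, much closer to the PIL backend
# (and what the segmentation reference now requests explicitly with antialias=True).
smooth = F.resize(img, [120, 160], antialias=True)

# The two outputs differ slightly, which is why switching would require re-training
# (see the TODO comments in the hunks above).
print((legacy - smooth).abs().max().item())
```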
diff --git a/references/segmentation/coco_utils.py b/references/segmentation/coco_utils.py index e02434012f1..6a15dbefb52 100644 --- a/references/segmentation/coco_utils.py +++ b/references/segmentation/coco_utils.py @@ -68,11 +68,6 @@ def _has_valid_annotation(anno): # if more than 1k pixels occupied in the image return sum(obj["area"] for obj in anno) > 1000 - if not isinstance(dataset, torchvision.datasets.CocoDetection): - raise TypeError( - f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" - ) - ids = [] for ds_idx, img_id in enumerate(dataset.ids): ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) @@ -86,7 +81,7 @@ def _has_valid_annotation(anno): return dataset -def get_coco(root, image_set, transforms): +def get_coco(root, image_set, transforms, use_v2=False): PATHS = { "train": ("train2017", os.path.join("annotations", "instances_train2017.json")), "val": ("val2017", os.path.join("annotations", "instances_val2017.json")), @@ -94,13 +89,24 @@ def get_coco(root, image_set, transforms): } CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72] - transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms]) - img_folder, ann_file = PATHS[image_set] img_folder = os.path.join(root, img_folder) ann_file = os.path.join(root, ann_file) - dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + # The 2 "Compose" below achieve the same thing: converting coco detection + # samples into segmentation-compatible samples. They just do it with + # slightly different implementations. We could refactor and unify, but + # keeping them separate helps keeping the v2 version clean + if use_v2: + import v2_extras + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + transforms = Compose([v2_extras.CocoDetectionToVOCSegmentation(), transforms]) + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys={"masks", "labels"}) + else: + transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms]) + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) if image_set == "train": dataset = _coco_remove_images_without_annotations(dataset, CAT_LIST) diff --git a/references/segmentation/presets.py b/references/segmentation/presets.py index ed02ae660e4..803769fcafc 100644 --- a/references/segmentation/presets.py +++ b/references/segmentation/presets.py @@ -1,39 +1,109 @@ import torch -import transforms as T + + +def get_modules(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + import torchvision.tv_tensors + import v2_extras + + return torchvision.transforms.v2, torchvision.tv_tensors, v2_extras + else: + import transforms + + return transforms, None, None class SegmentationPresetTrain: - def __init__(self, *, base_size, crop_size, hflip_prob=0.5, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): - min_size = int(0.5 * base_size) - max_size = int(2.0 * base_size) + def __init__( + self, + *, + base_size, + crop_size, + hflip_prob=0.5, + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + backend="pil", + use_v2=False, + ): + T, tv_tensors, v2_extras = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == 
"tv_tensor": + transforms.append(T.ToImage()) + elif backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + transforms += [T.RandomResize(min_size=int(0.5 * base_size), max_size=int(2.0 * base_size))] - trans = [T.RandomResize(min_size, max_size)] if hflip_prob > 0: - trans.append(T.RandomHorizontalFlip(hflip_prob)) - trans.extend( - [ - T.RandomCrop(crop_size), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize(mean=mean, std=std), + transforms += [T.RandomHorizontalFlip(hflip_prob)] + + if use_v2: + # We need a custom pad transform here, since the padding we want to perform here is fundamentally + # different from the padding in `RandomCrop` if `pad_if_needed=True`. + transforms += [v2_extras.PadIfSmaller(crop_size, fill={tv_tensors.Mask: 255, "others": 0})] + + transforms += [T.RandomCrop(crop_size)] + + if backend == "pil": + transforms += [T.PILToTensor()] + + if use_v2: + img_type = tv_tensors.Image if backend == "tv_tensor" else torch.Tensor + transforms += [ + T.ToDtype(dtype={img_type: torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True) ] - ) - self.transforms = T.Compose(trans) + else: + # No need to explicitly convert masks as they're magically int64 already + transforms += [T.ToDtype(torch.float, scale=True)] + + transforms += [T.Normalize(mean=mean, std=std)] + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) class SegmentationPresetEval: - def __init__(self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): - self.transforms = T.Compose( - [ - T.RandomResize(base_size, base_size), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize(mean=mean, std=std), - ] - ) + def __init__( + self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), backend="pil", use_v2=False + ): + T, _, _ = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms += [T.PILToTensor()] + elif backend == "tv_tensor": + transforms += [T.ToImage()] + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + if use_v2: + transforms += [T.Resize(size=(base_size, base_size))] + else: + transforms += [T.RandomResize(min_size=base_size, max_size=base_size)] + + if backend == "pil": + # Note: we could just convert to pure tensors even in v2? 
+ transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + + transforms += [ + T.ToDtype(torch.float, scale=True), + T.Normalize(mean=mean, std=std), + ] + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/segmentation/train.py b/references/segmentation/train.py index bb57e65b801..abdc3c6aacb 100644 --- a/references/segmentation/train.py +++ b/references/segmentation/train.py @@ -14,24 +14,30 @@ from torchvision.transforms import functional as F, InterpolationMode -def get_dataset(dir_path, name, image_set, transform): +def get_dataset(args, is_train): def sbd(*args, **kwargs): + kwargs.pop("use_v2") return torchvision.datasets.SBDataset(*args, mode="segmentation", **kwargs) + def voc(*args, **kwargs): + kwargs.pop("use_v2") + return torchvision.datasets.VOCSegmentation(*args, **kwargs) + paths = { - "voc": (dir_path, torchvision.datasets.VOCSegmentation, 21), - "voc_aug": (dir_path, sbd, 21), - "coco": (dir_path, get_coco, 21), + "voc": (args.data_path, voc, 21), + "voc_aug": (args.data_path, sbd, 21), + "coco": (args.data_path, get_coco, 21), } - p, ds_fn, num_classes = paths[name] + p, ds_fn, num_classes = paths[args.dataset] - ds = ds_fn(p, image_set=image_set, transforms=transform) + image_set = "train" if is_train else "val" + ds = ds_fn(p, image_set=image_set, transforms=get_transform(is_train, args), use_v2=args.use_v2) return ds, num_classes -def get_transform(train, args): - if train: - return presets.SegmentationPresetTrain(base_size=520, crop_size=480) +def get_transform(is_train, args): + if is_train: + return presets.SegmentationPresetTrain(base_size=520, crop_size=480, backend=args.backend, use_v2=args.use_v2) elif args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) trans = weights.transforms() @@ -44,7 +50,7 @@ def preprocessing(img, target): return preprocessing else: - return presets.SegmentationPresetEval(base_size=520) + return presets.SegmentationPresetEval(base_size=520, backend=args.backend, use_v2=args.use_v2) def criterion(inputs, target): @@ -120,6 +126,12 @@ def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, devi def main(args): + if args.backend.lower() != "pil" and not args.use_v2: + # TODO: Support tensor backend in V1? 
+ raise ValueError("Use --use-v2 if you want to use the tv_tensor or tensor backend.") + if args.use_v2 and args.dataset != "coco": + raise ValueError("v2 is only support supported for coco dataset for now.") + if args.output_dir: utils.mkdir(args.output_dir) @@ -134,8 +146,8 @@ def main(args): else: torch.backends.cudnn.benchmark = True - dataset, num_classes = get_dataset(args.data_path, args.dataset, "train", get_transform(True, args)) - dataset_test, _ = get_dataset(args.data_path, args.dataset, "val", get_transform(False, args)) + dataset, num_classes = get_dataset(args, is_train=True) + dataset_test, _ = get_dataset(args, is_train=False) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) @@ -211,7 +223,7 @@ def main(args): lr_scheduler = main_lr_scheduler if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"], strict=not args.test_only) if not args.test_only: optimizer.load_state_dict(checkpoint["optimizer"]) @@ -260,7 +272,7 @@ def get_args_parser(add_help=True): parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path") parser.add_argument("--dataset", default="coco", type=str, help="dataset name") parser.add_argument("--model", default="fcn_resnet101", type=str, help="model name") - parser.add_argument("--aux-loss", action="store_true", help="auxiliar loss") + parser.add_argument("--aux-loss", action="store_true", help="auxiliary loss") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( "-b", "--batch-size", default=8, type=int, help="images per gpu, the total batch size is $NGPU x batch_size" @@ -307,6 +319,8 @@ def get_args_parser(add_help=True): # Mixed precision training parameters parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training") + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") return parser diff --git a/references/segmentation/transforms.py b/references/segmentation/transforms.py index 518048db2fa..6934b9f862e 100644 --- a/references/segmentation/transforms.py +++ b/references/segmentation/transforms.py @@ -35,7 +35,7 @@ def __init__(self, min_size, max_size=None): def __call__(self, image, target): size = random.randint(self.min_size, self.max_size) - image = F.resize(image, size) + image = F.resize(image, size, antialias=True) target = F.resize(target, size, interpolation=T.InterpolationMode.NEAREST) return image, target @@ -81,11 +81,14 @@ def __call__(self, image, target): return image, target -class ConvertImageDtype: - def __init__(self, dtype): +class ToDtype: + def __init__(self, dtype, scale=False): self.dtype = dtype + self.scale = scale def __call__(self, image, target): + if not self.scale: + return image.to(dtype=self.dtype), target image = F.convert_image_dtype(image, self.dtype) return image, target diff --git a/references/segmentation/utils.py b/references/segmentation/utils.py index 4ea24db83ed..92db1899851 100644 --- a/references/segmentation/utils.py +++ b/references/segmentation/utils.py @@ -88,7 +88,7 @@ def compute(self): return acc_global, acc, iu def reduce_from_all_processes(self): - reduce_across_processes(self.mat) + self.mat = 
reduce_across_processes(self.mat).to(torch.int64) def __str__(self): acc_global, acc, iu = self.compute() @@ -267,9 +267,9 @@ def init_distributed_mode(args): args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ["WORLD_SIZE"]) args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() + # elif "SLURM_PROCID" in os.environ: + # args.rank = int(os.environ["SLURM_PROCID"]) + # args.gpu = args.rank % torch.cuda.device_count() elif hasattr(args, "rank"): pass else: diff --git a/references/segmentation/v2_extras.py b/references/segmentation/v2_extras.py new file mode 100644 index 00000000000..2d9eb3e661a --- /dev/null +++ b/references/segmentation/v2_extras.py @@ -0,0 +1,83 @@ +"""This file only exists to be lazy-imported and avoid V2-related import warnings when just using V1.""" +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + + +class PadIfSmaller(v2.Transform): + def __init__(self, size, fill=0): + super().__init__() + self.size = size + self.fill = v2._utils._setup_fill_arg(fill) + + def make_params(self, sample): + _, height, width = v2._utils.query_chw(sample) + padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] + needs_padding = any(padding) + return dict(padding=padding, needs_padding=needs_padding) + + def transform(self, inpt, params): + if not params["needs_padding"]: + return inpt + + fill = v2._utils._get_fill(self.fill, type(inpt)) + fill = v2._utils._convert_fill_arg(fill) + + return v2.functional.pad(inpt, padding=params["padding"], fill=fill) + + +class CocoDetectionToVOCSegmentation(v2.Transform): + """Turn samples from datasets.CocoDetection into the same format as VOCSegmentation. + + This is achieved in two steps: + + 1. COCO differentiates between 91 categories while VOC only supports 21, including background for both. Fortunately, + the COCO categories are a superset of the VOC ones and thus can be mapped. Instances of the 70 categories not + present in VOC are dropped and replaced by background. + 2. COCO only offers detection masks, i.e. a (N, H, W) bool-ish tensor, where the truthy values in each individual + mask denote the instance. However, a segmentation mask is a (H, W) integer tensor (typically torch.uint8), where + the value of each pixel denotes the category it belongs to. The detection masks are merged into one segmentation + mask while pixels that belong to multiple detection masks are marked as invalid. 
+ """ + + COCO_TO_VOC_LABEL_MAP = dict( + zip( + [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72], + range(21), + ) + ) + INVALID_VALUE = 255 + + def _coco_detection_masks_to_voc_segmentation_mask(self, target): + if "masks" not in target: + return None + + instance_masks, instance_labels_coco = target["masks"], target["labels"] + + valid_labels_voc = [ + (idx, label_voc) + for idx, label_coco in enumerate(instance_labels_coco.tolist()) + if (label_voc := self.COCO_TO_VOC_LABEL_MAP.get(label_coco)) is not None + ] + + if not valid_labels_voc: + return None + + valid_voc_category_idcs, instance_labels_voc = zip(*valid_labels_voc) + + instance_masks = instance_masks[list(valid_voc_category_idcs)].to(torch.uint8) + instance_labels_voc = torch.tensor(instance_labels_voc, dtype=torch.uint8) + + # Calling `.max()` on the stacked detection masks works fine to separate background from foreground as long as + # there is at most a single instance per pixel. Overlapping instances will be filtered out in the next step. + segmentation_mask, _ = (instance_masks * instance_labels_voc.reshape(-1, 1, 1)).max(dim=0) + segmentation_mask[instance_masks.sum(dim=0) > 1] = self.INVALID_VALUE + + return segmentation_mask + + def forward(self, image, target): + segmentation_mask = self._coco_detection_masks_to_voc_segmentation_mask(target) + if segmentation_mask is None: + segmentation_mask = torch.zeros(v2.functional.get_size(image), dtype=torch.uint8) + + return image, tv_tensors.Mask(segmentation_mask) diff --git a/references/similarity/sampler.py b/references/similarity/sampler.py index f4564eca33e..fe6517418ab 100644 --- a/references/similarity/sampler.py +++ b/references/similarity/sampler.py @@ -48,7 +48,7 @@ def __init__(self, groups, p, k): # Ensures there are enough classes to sample from if len(self.groups) < p: - raise ValueError("There are not enought classes to sample from") + raise ValueError("There are not enough classes to sample from") def __iter__(self): # Shuffle samples within groups diff --git a/references/similarity/train.py b/references/similarity/train.py index 146e2bef688..7686729927e 100644 --- a/references/similarity/train.py +++ b/references/similarity/train.py @@ -101,7 +101,7 @@ def main(args): model = EmbeddingNet() if args.resume: - model.load_state_dict(torch.load(args.resume)) + model.load_state_dict(torch.load(args.resume, weights_only=True)) model.to(device) diff --git a/references/video_classification/README.md b/references/video_classification/README.md index 9bd1b9cc285..39c5d8f1bba 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -76,11 +76,12 @@ Input data augmentations at validation time (with optional parameters): 5. Convert BCHW to CBHW This translates in the following set of command-line arguments. Please note that `--batch-size` parameter controls the -batch size per GPU. Moreover note that our default `--lr` is configured for 64 GPUs which is how many we used for the +batch size per GPU. 
Moreover, note that our default `--lr` is configured for 64 GPUs which is how many we used for the Video resnet models: ``` # number of frames per clip --clip_len 16 \ +--frame-rate 15 \ # allow for temporal jittering --clips_per_video 5 \ --batch-size 24 \ @@ -97,6 +98,21 @@ Video resnet models: --val-crop-size 112 112 ``` +### S3D + +The S3D model was trained similarly to the above but with the following changes on the default configuration: +``` +--batch-size=12 --lr 0.2 --clip-len 64 --clips-per-video 5 --sync-bn \ +--train-resize-size 256 256 --train-crop-size 224 224 --val-resize-size 256 256 --val-crop-size 224 224 +``` + +We used 64 GPUs to train the architecture. + +To estimate the validation statistics of the model, we run the reference script with the following configuration: +``` +--batch-size=16 --test-only --clip-len 128 --clips-per-video 1 +``` + ### Additional video modelling resources - [Video Model Zoo](https://github.com/facebookresearch/VMZ) diff --git a/references/video_classification/presets.py b/references/video_classification/presets.py index ef774052257..f73802c9666 100644 --- a/references/video_classification/presets.py +++ b/references/video_classification/presets.py @@ -15,7 +15,11 @@ def __init__( ): trans = [ transforms.ConvertImageDtype(torch.float32), - transforms.Resize(resize_size), + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the video models with antialias=True? + transforms.Resize(resize_size, antialias=False), ] if hflip_prob > 0: trans.append(transforms.RandomHorizontalFlip(hflip_prob)) @@ -31,7 +35,11 @@ def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), self.transforms = transforms.Compose( [ transforms.ConvertImageDtype(torch.float32), - transforms.Resize(resize_size), + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the video models with antialias=True? 
+ transforms.Resize(resize_size, antialias=False), transforms.Normalize(mean=mean, std=std), transforms.CenterCrop(crop_size), ConvertBCHWtoCBHW(), diff --git a/references/video_classification/train.py b/references/video_classification/train.py index e26231bb914..a03a9722003 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -164,7 +164,7 @@ def main(args): if args.cache_dataset and os.path.exists(cache_path): print(f"Loading dataset_train from {cache_path}") - dataset, _ = torch.load(cache_path) + dataset, _ = torch.load(cache_path, weights_only=False) dataset.transform = transform_train else: if args.distributed: @@ -201,7 +201,7 @@ def main(args): if args.cache_dataset and os.path.exists(cache_path): print(f"Loading dataset_test from {cache_path}") - dataset_test, _ = torch.load(cache_path) + dataset_test, _ = torch.load(cache_path, weights_only=False) dataset_test.transform = transform_test else: if args.distributed: @@ -295,7 +295,7 @@ def main(args): model_without_ddp = model.module if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) optimizer.load_state_dict(checkpoint["optimizer"]) lr_scheduler.load_state_dict(checkpoint["lr_scheduler"]) diff --git a/release/README.md b/release/README.md new file mode 100644 index 00000000000..830f964e531 --- /dev/null +++ b/release/README.md @@ -0,0 +1,3 @@ +# Vision Release Scripts + +This folder contains script(s) used for releasing new versions of the Vision package diff --git a/release/apply-release-changes.py b/release/apply-release-changes.py new file mode 100644 index 00000000000..22dd37216f8 --- /dev/null +++ b/release/apply-release-changes.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +""" +apply-release-changes.py - Cross-platform script to replace main with a specified release version in YML files + +This script performs two replacements in YML files in .github/workflows/: +1. Replaces @main with @release/VERSION +2. 
Replaces 'test-infra-ref: main' with 'test-infra-ref: release/VERSION' + +Usage: + python apply-release-changes.py VERSION + +Example: + python apply-release-changes.py 2.7 +""" + +import os +import pathlib +import sys +from typing import Optional + + +def replace_in_file(file_path: pathlib.Path, old_text: str, new_text: str) -> None: + """Replace all occurrences of old_text with new_text in the specified file.""" + try: + # Try reading the file without specifying encoding to use the default + encoding = None + try: + content = file_path.read_text() + except UnicodeDecodeError: + # If that fails, try with UTF-8 + encoding = "utf-8" + content = file_path.read_text(encoding=encoding) + + # Perform the replacement + new_content = content.replace(old_text, new_text) + + # Only write if changes were made + if new_content != content: + # Write with the same encoding we used to read + if encoding: + file_path.write_text(new_content, encoding=encoding) + else: + file_path.write_text(new_content) + print(f"Updated: {file_path}") + + except Exception as e: + print(f"Error processing {file_path}: {e}") + + +def find_repo_root() -> Optional[pathlib.Path]: + """Find the git repository root by searching for .git directory.""" + # Start from the current directory and traverse upwards + current_path = pathlib.Path.cwd().absolute() + + while current_path != current_path.parent: + # Check if .git directory exists + git_dir = current_path / ".git" + if git_dir.exists() and git_dir.is_dir(): + return current_path + + # Move up one directory + current_path = current_path.parent + + # If we get here, we didn't find a repository root + return None + + +def main() -> None: + # Check if version is provided as command line argument + if len(sys.argv) != 2: + print("Error: Exactly one version parameter is required") + print(f"Usage: python {os.path.basename(__file__)} VERSION") + print("Example: python apply-release-changes.py 2.7") + sys.exit(1) + + # Get version from command line argument + version = sys.argv[1] + print(f"Using release version: {version}") + + # Find the repository root by searching for .git directory + repo_root = find_repo_root() + if not repo_root: + print("Error: Not inside a git repository. 
Please run from within a git repository.") + sys.exit(1) + + print(f"Repository root found at: {repo_root}") + + # Get path to workflow directory + workflow_dir = repo_root / ".github" / "workflows" + + # Process all workflow files and perform both replacements on each file + for yml_file in workflow_dir.glob("*.yml"): + replace_in_file(yml_file, "@main", f"@release/{version}") + replace_in_file(yml_file, "test-infra-ref: main", f"test-infra-ref: release/{version}") + + +if __name__ == "__main__": + print("Starting YML updates...") + main() + print("YML updates completed.") diff --git a/scripts/download_model_urls.py b/scripts/download_model_urls.py new file mode 100644 index 00000000000..f5f53d71e98 --- /dev/null +++ b/scripts/download_model_urls.py @@ -0,0 +1,41 @@ +import asyncio +import sys +from pathlib import Path +from time import perf_counter +from urllib.parse import urlsplit + +import aiofiles +import aiohttp +from torchvision import models +from tqdm.asyncio import tqdm + + +async def main(download_root): + download_root.mkdir(parents=True, exist_ok=True) + urls = {weight.url for name in models.list_models() for weight in iter(models.get_model_weights(name))} + + async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=None)) as session: + await tqdm.gather(*[download(download_root, session, url) for url in urls]) + + +async def download(download_root, session, url): + response = await session.get(url, params=dict(source="ci")) + + assert response.ok + + file_name = Path(urlsplit(url).path).name + async with aiofiles.open(download_root / file_name, "wb") as f: + async for data in response.content.iter_any(): + await f.write(data) + + +if __name__ == "__main__": + download_root = ( + (Path(sys.argv[1]) if len(sys.argv) > 1 else Path("~/.cache/torch/hub/checkpoints")).expanduser().resolve() + ) + print(f"Downloading model weights to {download_root}") + start = perf_counter() + asyncio.get_event_loop().run_until_complete(main(download_root)) + stop = perf_counter() + minutes, seconds = divmod(stop - start, 60) + print(f"Download took {minutes:2.0f}m {seconds:2.0f}s") diff --git a/scripts/release_notes/classify_prs.py b/scripts/release_notes/classify_prs.py index de55c299381..5847c9f03f5 100644 --- a/scripts/release_notes/classify_prs.py +++ b/scripts/release_notes/classify_prs.py @@ -1,40 +1,22 @@ # In[1]: - -# imports and set configuration import pandas as pd -from retrieve_prs_data import run - -exclude_prototype = True -data_filename = "10.0_to_11.0-rc2.json" -previous_release = "v10.0" -current_release = "v11.0-rc2" # In[2]: - - +data_filename = "data.json" df = pd.read_json(data_filename).T df.tail() - # In[3]: - - all_labels = {lbl for labels in df["labels"] for lbl in labels} all_labels - # In[4]: - - # Add one column per label for label in all_labels: df[label] = df["labels"].apply(lambda labels_list: label in labels_list) df.head() - # In[5]: - - # Add a clean "module" column. It contains tuples since PRs can have more than one module. # Maybe we should include "topics" in that column as well? 
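For context on the notebook cells above: the release-notes helper expands the `labels` list of every PR into one boolean column per label, so each section of the notes can then be built by plain boolean filtering. A toy, self-contained version of that step on two made-up PR rows (the real frame is loaded from `data.json`):

```python
import pandas as pd

# Two made-up PR rows standing in for the contents of data.json.
df = pd.DataFrame(
    {
        "pr_number": [1234, 5678],
        "labels": [["enhancement", "module: transforms"], ["bug", "module: ops"]],
    }
)

# Same expansion as the notebook cell above: one boolean column per label.
all_labels = {lbl for labels in df["labels"] for lbl in labels}
for label in all_labels:
    df[label] = df["labels"].apply(lambda labels_list: label in labels_list)

# Sections such as "Improvements" or "Bug Fixes" then reduce to boolean filtering.
print(df[df["enhancement"]][["pr_number", "labels"]])
```

The "module" column built in the following cells is derived the same way from the `module:`-prefixed labels and kept as a tuple, since a single PR can touch several modules.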
@@ -51,24 +33,15 @@ df["module"] = df.module.apply(tuple) df.head() - # In[6]: - - mod_df = df.set_index("module").sort_index() mod_df.tail() - # In[7]: - - # All improvement PRs mod_df[mod_df["enhancement"]].head() - # In[8]: - - # improvement f module # note: don't filter module name on the index as the index contain tuples with non-exclusive values # Use the boolean column instead @@ -76,12 +49,10 @@ # In[9]: - - -def format_prs(mod_df): +def format_prs(mod_df, exclude_prototype=True): out = [] for idx, row in mod_df.iterrows(): - if exclude_prototype and row["prototype"]: + if exclude_prototype and "prototype" in row and row["prototype"]: continue modules = idx # Put "documentation" and "tests" first for sorting to be dece @@ -98,8 +69,6 @@ def format_prs(mod_df): # In[10]: - - included_prs = pd.DataFrame() # If labels are accurate, this should generate most of the release notes already @@ -112,27 +81,40 @@ def format_prs(mod_df): ("Bug Fixes", "bug"), ("Code Quality", "code quality"), ): - print(f"## {section_title}") - print() - tmp_df = mod_df[mod_df[module_idx]] - included_prs = pd.concat([included_prs, tmp_df]) - print(format_prs(tmp_df)) - print() + if module_idx in mod_df: + print(f"## {section_title}") + print() + tmp_df = mod_df[mod_df[module_idx]] + included_prs = pd.concat([included_prs, tmp_df]) + print(format_prs(tmp_df)) + print() # In[11]: - - # Missing PRs are these ones... classify them manually missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False) print(format_prs(missing_prs)) # In[12]: - # Generate list of contributors print() print("## Contributors") -command_to_run = f"{{ git shortlog -s {previous_release}..{current_release} | cut -f2- & git log -s {previous_release}..{current_release} | grep Co-authored | cut -f2- -d: | cut -f1 -d\\< | sed 's/^ *//;s/ *$//' ; }} | sort --ignore-case | uniq | tr '\\n' ';' | sed 's/;/, /g;s/, $//' | fold -s" -rc, output, err = run(command_to_run) -print(output) +previous_release = "c35d3855ccbfa6a36e6ae6337a1f2c721c1f1e78" +current_release = "5181a854d8b127cf465cd22a67c1b5aaf6ccae05" +print( + f"{{ git shortlog -s {previous_release}..{current_release} | cut -f2- & git log -s {previous_release}..{current_release} | grep Co-authored | cut -f2- -d: | cut -f1 -d\\< | sed 's/^ *//;s/ *//' ; }} | sort --ignore-case | uniq | tr '\\n' ';' | sed 's/;/, /g;s/,//' | fold -s" +) + +# In[13]: +# Utility to extract PR numbers only from multiple lines, useful to bundle all +# the docs changes for example: +import re + +s = """ + +[] Remove unnecessary dependency from macOS/Conda binaries (#8077) +[rocm] [ROCm] remove HCC references (#8070) +""" + +print(", ".join(re.findall("(#\\d+)", s))) diff --git a/setup.cfg b/setup.cfg index f36195194cd..0f4ddbfab10 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ universal=1 [metadata] -license_file = LICENSE +license_files = LICENSE [pep8] max-line-length = 120 @@ -10,7 +10,7 @@ max-line-length = 120 [flake8] # note: we ignore all 501s (line too long) anyway as they're taken care of by black max-line-length = 120 -ignore = E203, E402, W503, W504, F821, E501 +ignore = E203, E402, W503, W504, F821, E501, B, C4, EXE per-file-ignores = __init__.py: F401, F403, F405 ./hubconf.py: F401 diff --git a/setup.py b/setup.py index 54319451521..5e69fa50f52 100644 --- a/setup.py +++ b/setup.py @@ -2,50 +2,85 @@ import distutils.spawn import glob import os +import shlex import shutil import subprocess import sys +import warnings +from pathlib import Path import torch from 
pkg_resources import DistributionNotFound, get_distribution, parse_version from setuptools import find_packages, setup -from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME, CUDAExtension +from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME, CUDAExtension, ROCM_HOME + +FORCE_CUDA = os.getenv("FORCE_CUDA", "0") == "1" +FORCE_MPS = os.getenv("FORCE_MPS", "0") == "1" +DEBUG = os.getenv("DEBUG", "0") == "1" +USE_PNG = os.getenv("TORCHVISION_USE_PNG", "1") == "1" +USE_JPEG = os.getenv("TORCHVISION_USE_JPEG", "1") == "1" +USE_WEBP = os.getenv("TORCHVISION_USE_WEBP", "1") == "1" +USE_NVJPEG = os.getenv("TORCHVISION_USE_NVJPEG", "1") == "1" +NVCC_FLAGS = os.getenv("NVCC_FLAGS", None) +# Note: the GPU video decoding stuff used to be called "video codec", which +# isn't an accurate or descriptive name considering there are at least 2 other +# video decoding backends in torchvision. I'm renaming this to "gpu video +# decoder" where possible, keeping user facing names (like the env var below) to +# the old scheme for BC. +USE_GPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1" +# Same here: "use ffmpeg" was used to denote "use cpu video decoder". +USE_CPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_FFMPEG", "1") == "1" + +TORCHVISION_INCLUDE = os.environ.get("TORCHVISION_INCLUDE", "") +TORCHVISION_LIBRARY = os.environ.get("TORCHVISION_LIBRARY", "") +TORCHVISION_INCLUDE = TORCHVISION_INCLUDE.split(os.pathsep) if TORCHVISION_INCLUDE else [] +TORCHVISION_LIBRARY = TORCHVISION_LIBRARY.split(os.pathsep) if TORCHVISION_LIBRARY else [] + +ROOT_DIR = Path(__file__).absolute().parent +CSRS_DIR = ROOT_DIR / "torchvision/csrc" +IS_ROCM = (torch.version.hip is not None) and (ROCM_HOME is not None) +BUILD_CUDA_SOURCES = (torch.cuda.is_available() and ((CUDA_HOME is not None) or IS_ROCM)) or FORCE_CUDA + +package_name = os.getenv("TORCHVISION_PACKAGE_NAME", "torchvision") + +print("Torchvision build configuration:") +print(f"{FORCE_CUDA = }") +print(f"{FORCE_MPS = }") +print(f"{DEBUG = }") +print(f"{USE_PNG = }") +print(f"{USE_JPEG = }") +print(f"{USE_WEBP = }") +print(f"{USE_NVJPEG = }") +print(f"{NVCC_FLAGS = }") +print(f"{USE_CPU_VIDEO_DECODER = }") +print(f"{USE_GPU_VIDEO_DECODER = }") +print(f"{TORCHVISION_INCLUDE = }") +print(f"{TORCHVISION_LIBRARY = }") +print(f"{IS_ROCM = }") +print(f"{BUILD_CUDA_SOURCES = }") + + +def get_version(): + with open(ROOT_DIR / "version.txt") as f: + version = f.readline().strip() + sha = "Unknown" - -def read(*names, **kwargs): - with open(os.path.join(os.path.dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8")) as fp: - return fp.read() - - -def get_dist(pkgname): try: - return get_distribution(pkgname) - except DistributionNotFound: - return None - - -cwd = os.path.dirname(os.path.abspath(__file__)) + sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=str(ROOT_DIR)).decode("ascii").strip() + except Exception: + pass -version_txt = os.path.join(cwd, "version.txt") -with open(version_txt) as f: - version = f.readline().strip() -sha = "Unknown" -package_name = "torchvision" + if os.getenv("BUILD_VERSION"): + version = os.getenv("BUILD_VERSION") + elif sha != "Unknown": + version += "+" + sha[:7] -try: - sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd).decode("ascii").strip() -except Exception: - pass + return version, sha -if os.getenv("BUILD_VERSION"): - version = os.getenv("BUILD_VERSION") -elif sha != "Unknown": - version += "+" + sha[:7] - -def 
write_version_file(): - version_path = os.path.join(cwd, "torchvision", "version.py") - with open(version_path, "w") as f: +def write_version_file(version, sha): + # Exists for BC, probably completely useless. + with open(ROOT_DIR / "torchvision" / "version.py", "w") as f: f.write(f"__version__ = '{version}'\n") f.write(f"git_version = {repr(sha)}\n") f.write("from torchvision.extension import _check_cuda_version\n") @@ -53,178 +88,56 @@ def write_version_file(): f.write(" cuda = _check_cuda_version()\n") -pytorch_dep = "torch" -if os.getenv("PYTORCH_VERSION"): - pytorch_dep += "==" + os.getenv("PYTORCH_VERSION") - -requirements = [ - "typing_extensions", - "numpy", - "requests", - pytorch_dep, -] - -# Excluding 8.3.* because of https://github.com/pytorch/vision/issues/4934 -pillow_ver = " >= 5.3.0, !=8.3.*" -pillow_req = "pillow-simd" if get_dist("pillow-simd") is not None else "pillow" -requirements.append(pillow_req + pillow_ver) - - -def find_library(name, vision_include): - this_dir = os.path.dirname(os.path.abspath(__file__)) - build_prefix = os.environ.get("BUILD_PREFIX", None) - is_conda_build = build_prefix is not None - - library_found = False - conda_installed = False - lib_folder = None - include_folder = None - library_header = f"{name}.h" - - # Lookup in TORCHVISION_INCLUDE or in the package file - package_path = [os.path.join(this_dir, "torchvision")] - for folder in vision_include + package_path: - candidate_path = os.path.join(folder, library_header) - library_found = os.path.exists(candidate_path) - if library_found: - break - - if not library_found: - print(f"Running build on conda-build: {is_conda_build}") - if is_conda_build: - # Add conda headers/libraries - if os.name == "nt": - build_prefix = os.path.join(build_prefix, "Library") - include_folder = os.path.join(build_prefix, "include") - lib_folder = os.path.join(build_prefix, "lib") - library_header_path = os.path.join(include_folder, library_header) - library_found = os.path.isfile(library_header_path) - conda_installed = library_found - else: - # Check if using Anaconda to produce wheels - conda = shutil.which("conda") - is_conda = conda is not None - print(f"Running build on conda: {is_conda}") - if is_conda: - python_executable = sys.executable - py_folder = os.path.dirname(python_executable) - if os.name == "nt": - env_path = os.path.join(py_folder, "Library") - else: - env_path = os.path.dirname(py_folder) - lib_folder = os.path.join(env_path, "lib") - include_folder = os.path.join(env_path, "include") - library_header_path = os.path.join(include_folder, library_header) - library_found = os.path.isfile(library_header_path) - conda_installed = library_found - - if not library_found: - if sys.platform == "linux": - library_found = os.path.exists(f"/usr/include/{library_header}") - library_found = library_found or os.path.exists(f"/usr/local/include/{library_header}") - - return library_found, conda_installed, include_folder, lib_folder - - -def get_extensions(): - this_dir = os.path.dirname(os.path.abspath(__file__)) - extensions_dir = os.path.join(this_dir, "torchvision", "csrc") - - main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + glob.glob( - os.path.join(extensions_dir, "ops", "*.cpp") - ) - source_cpu = ( - glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp")) - + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp")) - + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp")) - ) - - print("Compiling extensions with following flags:") - 
compile_cpp_tests = os.getenv("WITH_CPP_MODELS_TEST", "0") == "1" - print(f" WITH_CPP_MODELS_TEST: {compile_cpp_tests}") - force_cuda = os.getenv("FORCE_CUDA", "0") == "1" - print(f" FORCE_CUDA: {force_cuda}") - debug_mode = os.getenv("DEBUG", "0") == "1" - print(f" DEBUG: {debug_mode}") - use_png = os.getenv("TORCHVISION_USE_PNG", "1") == "1" - print(f" TORCHVISION_USE_PNG: {use_png}") - use_jpeg = os.getenv("TORCHVISION_USE_JPEG", "1") == "1" - print(f" TORCHVISION_USE_JPEG: {use_jpeg}") - use_nvjpeg = os.getenv("TORCHVISION_USE_NVJPEG", "1") == "1" - print(f" TORCHVISION_USE_NVJPEG: {use_nvjpeg}") - use_ffmpeg = os.getenv("TORCHVISION_USE_FFMPEG", "1") == "1" - print(f" TORCHVISION_USE_FFMPEG: {use_ffmpeg}") - use_video_codec = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1" - print(f" TORCHVISION_USE_VIDEO_CODEC: {use_video_codec}") - - nvcc_flags = os.getenv("NVCC_FLAGS", "") - print(f" NVCC_FLAGS: {nvcc_flags}") - - is_rocm_pytorch = False - - if torch.__version__ >= "1.5": - from torch.utils.cpp_extension import ROCM_HOME - - is_rocm_pytorch = (torch.version.hip is not None) and (ROCM_HOME is not None) - - if is_rocm_pytorch: - from torch.utils.hipify import hipify_python - - hipify_python.hipify( - project_directory=this_dir, - output_directory=this_dir, - includes="torchvision/csrc/ops/cuda/*", - show_detailed=True, - is_pytorch_extension=True, - ) - source_cuda = glob.glob(os.path.join(extensions_dir, "ops", "hip", "*.hip")) - # Copy over additional files - for file in glob.glob(r"torchvision/csrc/ops/cuda/*.h"): - shutil.copy(file, "torchvision/csrc/ops/hip") - else: - source_cuda = glob.glob(os.path.join(extensions_dir, "ops", "cuda", "*.cu")) - - source_cuda += glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) +def get_requirements(): + def get_dist(pkgname): + try: + return get_distribution(pkgname) + except DistributionNotFound: + return None + + pytorch_dep = os.getenv("TORCH_PACKAGE_NAME", "torch") + if version_pin := os.getenv("PYTORCH_VERSION"): + pytorch_dep += "==" + version_pin + elif (version_pin_ge := os.getenv("PYTORCH_VERSION_GE")) and (version_pin_lt := os.getenv("PYTORCH_VERSION_LT")): + # This branch and the associated env vars exist to help third-party + # builds like in https://github.com/pytorch/vision/pull/8936. This is + # supported on a best-effort basis, we don't guarantee that this won't + # eventually break (and we don't test it.) 
+ pytorch_dep += f">={version_pin_ge},<{version_pin_lt}" + + requirements = [ + "numpy", + pytorch_dep, + ] - sources = main_file + source_cpu - extension = CppExtension + # Excluding 8.3.* because of https://github.com/pytorch/vision/issues/4934 + pillow_ver = " >= 5.3.0, !=8.3.*" + pillow_req = "pillow-simd" if get_dist("pillow-simd") is not None else "pillow" + requirements.append(pillow_req + pillow_ver) - if compile_cpp_tests: - print("Compiling CPP tests") - test_dir = os.path.join(this_dir, "test") - models_dir = os.path.join(this_dir, "torchvision", "csrc", "models") - test_file = glob.glob(os.path.join(test_dir, "*.cpp")) - source_models = glob.glob(os.path.join(models_dir, "*.cpp")) + return requirements - test_file = [os.path.join(test_dir, s) for s in test_file] - source_models = [os.path.join(models_dir, s) for s in source_models] - tests = test_file + source_models - tests_include_dirs = [test_dir, models_dir] +def get_macros_and_flags(): define_macros = [] - extra_compile_args = {"cxx": []} - if (torch.cuda.is_available() and ((CUDA_HOME is not None) or is_rocm_pytorch)) or force_cuda: - extension = CUDAExtension - sources += source_cuda - if not is_rocm_pytorch: + if BUILD_CUDA_SOURCES: + if IS_ROCM: + define_macros += [("WITH_HIP", None)] + nvcc_flags = [] + else: define_macros += [("WITH_CUDA", None)] - if nvcc_flags == "": + if NVCC_FLAGS is None: nvcc_flags = [] else: - nvcc_flags = nvcc_flags.split(" ") - else: - define_macros += [("WITH_HIP", None)] - nvcc_flags = [] + nvcc_flags = shlex.split(NVCC_FLAGS) extra_compile_args["nvcc"] = nvcc_flags if sys.platform == "win32": define_macros += [("torchvision_EXPORTS", None)] - define_macros += [("USE_PYTHON", None)] extra_compile_args["cxx"].append("/MP") - if debug_mode: - print("Compiling in debug mode") + if DEBUG: extra_compile_args["cxx"].append("-g") extra_compile_args["cxx"].append("-O0") if "nvcc" in extra_compile_args: @@ -233,162 +146,249 @@ def get_extensions(): extra_compile_args["nvcc"] = [f for f in nvcc_flags if not ("-O" in f or "-g" in f)] extra_compile_args["nvcc"].append("-O0") extra_compile_args["nvcc"].append("-g") + else: + extra_compile_args["cxx"].append("-g0") - sources = [os.path.join(extensions_dir, s) for s in sources] + return define_macros, extra_compile_args - include_dirs = [extensions_dir] - ext_modules = [ - extension( - "torchvision._C", - sorted(sources), - include_dirs=include_dirs, - define_macros=define_macros, - extra_compile_args=extra_compile_args, - ) - ] - if compile_cpp_tests: - ext_modules.append( - extension( - "torchvision._C_tests", - tests, - include_dirs=tests_include_dirs, - define_macros=define_macros, - extra_compile_args=extra_compile_args, - ) - ) +def make_C_extension(): + print("Building _C extension") - # ------------------- Torchvision extra extensions ------------------------ - vision_include = os.environ.get("TORCHVISION_INCLUDE", None) - vision_library = os.environ.get("TORCHVISION_LIBRARY", None) - vision_include = vision_include.split(os.pathsep) if vision_include is not None else [] - vision_library = vision_library.split(os.pathsep) if vision_library is not None else [] - include_dirs += vision_include - library_dirs = vision_library + sources = ( + list(CSRS_DIR.glob("*.cpp")) + + list(CSRS_DIR.glob("ops/*.cpp")) + + list(CSRS_DIR.glob("ops/autocast/*.cpp")) + + list(CSRS_DIR.glob("ops/autograd/*.cpp")) + + list(CSRS_DIR.glob("ops/cpu/*.cpp")) + + list(CSRS_DIR.glob("ops/quantized/cpu/*.cpp")) + ) + mps_sources = list(CSRS_DIR.glob("ops/mps/*.mm")) - # 
Image reading extension - image_macros = [] - image_include = [extensions_dir] - image_library = [] - image_link_flags = [] + if IS_ROCM: + from torch.utils.hipify import hipify_python - if sys.platform == "win32": - image_macros += [("USE_PYTHON", None)] - - # Locating libPNG - libpng = shutil.which("libpng-config") - pngfix = shutil.which("pngfix") - png_found = libpng is not None or pngfix is not None - - use_png = use_png and png_found - if use_png: - print("Found PNG library") - if libpng is not None: - # Linux / Mac - min_version = "1.6.0" - png_version = subprocess.run([libpng, "--version"], stdout=subprocess.PIPE) - png_version = png_version.stdout.strip().decode("utf-8") - png_version = parse_version(png_version) - if png_version >= parse_version(min_version): - print("Building torchvision with PNG image support") - png_lib = subprocess.run([libpng, "--libdir"], stdout=subprocess.PIPE) - png_lib = png_lib.stdout.strip().decode("utf-8") - if "disabled" not in png_lib: - image_library += [png_lib] - png_include = subprocess.run([libpng, "--I_opts"], stdout=subprocess.PIPE) - png_include = png_include.stdout.strip().decode("utf-8") - _, png_include = png_include.split("-I") - image_include += [png_include] - image_link_flags.append("png") - print(f" libpng version: {png_version}") - print(f" libpng include path: {png_include}") - else: - print("Could not add PNG image support to torchvision:") - print(f" libpng minimum version {min_version}, found {png_version}") - use_png = False - else: - # Windows - png_lib = os.path.join(os.path.dirname(os.path.dirname(pngfix)), "lib") - png_include = os.path.join(os.path.dirname(os.path.dirname(pngfix)), "include", "libpng16") - image_library += [png_lib] - image_include += [png_include] - image_link_flags.append("libpng") + hipify_python.hipify( + project_directory=str(ROOT_DIR), + output_directory=str(ROOT_DIR), + includes="torchvision/csrc/ops/cuda/*", + show_detailed=True, + is_pytorch_extension=True, + ) + cuda_sources = list(CSRS_DIR.glob("ops/hip/*.hip")) + for header in CSRS_DIR.glob("ops/cuda/*.h"): + shutil.copy(str(header), str(CSRS_DIR / "ops/hip")) else: - print("Building torchvision without PNG image support") - image_macros += [("PNG_FOUND", str(int(use_png)))] - - # Locating libjpeg - (jpeg_found, jpeg_conda, jpeg_include, jpeg_lib) = find_library("jpeglib", vision_include) - - use_jpeg = use_jpeg and jpeg_found - if use_jpeg: - print("Building torchvision with JPEG image support") - image_link_flags.append("jpeg") - if jpeg_conda: - image_library += [jpeg_lib] - image_include += [jpeg_include] + cuda_sources = list(CSRS_DIR.glob("ops/cuda/*.cu")) + + if BUILD_CUDA_SOURCES: + Extension = CUDAExtension + sources += cuda_sources else: - print("Building torchvision without JPEG image support") - image_macros += [("JPEG_FOUND", str(int(use_jpeg)))] - - # Locating nvjpeg - # Should be included in CUDA_HOME for CUDA >= 10.1, which is the minimum version we have in the CI - nvjpeg_found = ( - extension is CUDAExtension - and CUDA_HOME is not None - and os.path.exists(os.path.join(CUDA_HOME, "include", "nvjpeg.h")) + Extension = CppExtension + if torch.backends.mps.is_available() or FORCE_MPS: + sources += mps_sources + + define_macros, extra_compile_args = get_macros_and_flags() + return Extension( + name="torchvision._C", + sources=sorted(str(s) for s in sources), + include_dirs=[CSRS_DIR], + define_macros=define_macros, + extra_compile_args=extra_compile_args, ) - use_nvjpeg = use_nvjpeg and nvjpeg_found - if use_nvjpeg: - 
print("Building torchvision with NVJPEG image support") - image_link_flags.append("nvjpeg") + +def find_libpng(): + # Returns (found, include dir, library dir, library name) + if sys.platform in ("linux", "darwin", "aix"): + libpng_config = shutil.which("libpng-config") + if libpng_config is None: + warnings.warn("libpng-config not found") + return False, None, None, None + min_version = parse_version("1.6.0") + png_version = parse_version( + subprocess.run([libpng_config, "--version"], stdout=subprocess.PIPE).stdout.strip().decode("utf-8") + ) + if png_version < min_version: + warnings.warn(f"libpng version {png_version} is less than minimum required version {min_version}") + return False, None, None, None + + include_dir = ( + subprocess.run([libpng_config, "--I_opts"], stdout=subprocess.PIPE) + .stdout.strip() + .decode("utf-8") + .split("-I")[1] + ) + library_dir = subprocess.run([libpng_config, "--libdir"], stdout=subprocess.PIPE).stdout.strip().decode("utf-8") + library = "png" + else: # Windows + pngfix = shutil.which("pngfix") + if pngfix is None: + warnings.warn("pngfix not found") + return False, None, None, None + pngfix_dir = Path(pngfix).absolute().parent.parent + + library_dir = str(pngfix_dir / "lib") + include_dir = str(pngfix_dir / "include/libpng16") + library = "libpng" + + return True, include_dir, library_dir, library + + +def find_library(header): + # returns (found, include dir, library dir) + # if include dir or library dir is None, it means that the library is in + # standard paths and don't need to be added to compiler / linker search + # paths + + searching_for = f"Searching for {header}" + + for folder in TORCHVISION_INCLUDE: + if (Path(folder) / header).exists(): + print(f"{searching_for} in {Path(folder) / header}. Found in TORCHVISION_INCLUDE.") + return True, None, None + print(f"{searching_for}. Didn't find in TORCHVISION_INCLUDE.") + + # Try conda-related prefixes. If BUILD_PREFIX is set it means conda-build is + # being run. If CONDA_PREFIX is set then we're in a conda environment. + for prefix_env_var in ("BUILD_PREFIX", "CONDA_PREFIX"): + if (prefix := os.environ.get(prefix_env_var)) is not None: + prefix = Path(prefix) + if sys.platform == "win32": + prefix = prefix / "Library" + include_dir = prefix / "include" + library_dir = prefix / "lib" + if (include_dir / header).exists(): + print(f"{searching_for}. Found in {prefix_env_var}.") + return True, str(include_dir), str(library_dir) + print(f"{searching_for}. Didn't find in {prefix_env_var}.") + + if sys.platform == "linux": + for prefix in (Path("/usr/include"), Path("/usr/local/include")): + if (prefix / header).exists(): + print(f"{searching_for}. Found in {prefix}.") + return True, None, None + print(f"{searching_for}. Didn't find in {prefix}") + + if sys.platform == "darwin": + HOMEBREW_PATH = Path("/opt/homebrew") + include_dir = HOMEBREW_PATH / "include" + library_dir = HOMEBREW_PATH / "lib" + if (include_dir / header).exists(): + print(f"{searching_for}. 
Found in {include_dir}.") + return True, str(include_dir), str(library_dir) + + return False, None, None + + +def make_image_extension(): + print("Building image extension") + + include_dirs = TORCHVISION_INCLUDE.copy() + library_dirs = TORCHVISION_LIBRARY.copy() + + libraries = [] + define_macros, extra_compile_args = get_macros_and_flags() + + image_dir = CSRS_DIR / "io/image" + sources = list(image_dir.glob("*.cpp")) + list(image_dir.glob("cpu/*.cpp")) + list(image_dir.glob("cpu/giflib/*.c")) + + if IS_ROCM: + sources += list(image_dir.glob("hip/*.cpp")) + # we need to exclude this in favor of the hipified source + sources.remove(image_dir / "image.cpp") else: - print("Building torchvision without NVJPEG image support") - image_macros += [("NVJPEG_FOUND", str(int(use_nvjpeg)))] - - image_path = os.path.join(extensions_dir, "io", "image") - image_src = ( - glob.glob(os.path.join(image_path, "*.cpp")) - + glob.glob(os.path.join(image_path, "cpu", "*.cpp")) - + glob.glob(os.path.join(image_path, "cuda", "*.cpp")) + sources += list(image_dir.glob("cuda/*.cpp")) + + Extension = CppExtension + + if USE_PNG: + png_found, png_include_dir, png_library_dir, png_library = find_libpng() + if png_found: + print("Building torchvision with PNG support") + print(f"{png_include_dir = }") + print(f"{png_library_dir = }") + include_dirs.append(png_include_dir) + library_dirs.append(png_library_dir) + libraries.append(png_library) + define_macros += [("PNG_FOUND", 1)] + else: + warnings.warn("Building torchvision without PNG support") + + if USE_JPEG: + jpeg_found, jpeg_include_dir, jpeg_library_dir = find_library(header="jpeglib.h") + if jpeg_found: + print("Building torchvision with JPEG support") + print(f"{jpeg_include_dir = }") + print(f"{jpeg_library_dir = }") + if jpeg_include_dir is not None and jpeg_library_dir is not None: + # if those are None it means they come from standard paths that are already in the search paths, which we don't need to re-add. + include_dirs.append(jpeg_include_dir) + library_dirs.append(jpeg_library_dir) + libraries.append("jpeg") + define_macros += [("JPEG_FOUND", 1)] + else: + warnings.warn("Building torchvision without JPEG support") + + if USE_WEBP: + webp_found, webp_include_dir, webp_library_dir = find_library(header="webp/decode.h") + if webp_found: + print("Building torchvision with WEBP support") + print(f"{webp_include_dir = }") + print(f"{webp_library_dir = }") + if webp_include_dir is not None and webp_library_dir is not None: + # if those are None it means they come from standard paths that are already in the search paths, which we don't need to re-add. 
+ include_dirs.append(webp_include_dir) + library_dirs.append(webp_library_dir) + webp_library = "libwebp" if sys.platform == "win32" else "webp" + libraries.append(webp_library) + define_macros += [("WEBP_FOUND", 1)] + else: + warnings.warn("Building torchvision without WEBP support") + + if USE_NVJPEG and (torch.cuda.is_available() or FORCE_CUDA): + nvjpeg_found = CUDA_HOME is not None and (Path(CUDA_HOME) / "include/nvjpeg.h").exists() + + if nvjpeg_found: + print("Building torchvision with NVJPEG image support") + libraries.append("nvjpeg") + define_macros += [("NVJPEG_FOUND", 1)] + Extension = CUDAExtension + else: + warnings.warn("Building torchvision without NVJPEG support") + elif USE_NVJPEG: + warnings.warn("Building torchvision without NVJPEG support") + + return Extension( + name="torchvision.image", + sources=sorted(str(s) for s in sources), + include_dirs=include_dirs, + library_dirs=library_dirs, + define_macros=define_macros, + libraries=libraries, + extra_compile_args=extra_compile_args, ) - if use_png or use_jpeg: - ext_modules.append( - extension( - "torchvision.image", - image_src, - include_dirs=image_include + include_dirs + [image_path], - library_dirs=image_library + library_dirs, - define_macros=image_macros, - libraries=image_link_flags, - extra_compile_args=extra_compile_args, - ) - ) - # Locating ffmpeg - ffmpeg_exe = shutil.which("ffmpeg") - has_ffmpeg = ffmpeg_exe is not None - ffmpeg_version = None - # FIXME: Building torchvision with ffmpeg on MacOS or with Python 3.9 - # FIXME: causes crash. See the following GitHub issues for more details. - # FIXME: https://github.com/pytorch/pytorch/issues/65000 - # FIXME: https://github.com/pytorch/vision/issues/3367 +def make_video_decoders_extensions(): + print("Building video decoder extensions") + + build_without_extensions_msg = "Building without video decoders extensions." if sys.platform != "linux" or (sys.version_info.major == 3 and sys.version_info.minor == 9): - has_ffmpeg = False - if has_ffmpeg: - try: - # This is to check if ffmpeg is installed properly. - ffmpeg_version = subprocess.check_output(["ffmpeg", "-version"]) - except subprocess.CalledProcessError: - print("Building torchvision without ffmpeg support") - print(" Error fetching ffmpeg version, ignoring ffmpeg.") - has_ffmpeg = False + # FIXME: Building torchvision with ffmpeg on MacOS or with Python 3.9 + # FIXME: causes crash. See the following GitHub issues for more details. 
+ # FIXME: https://github.com/pytorch/pytorch/issues/65000 + # FIXME: https://github.com/pytorch/vision/issues/3367 + print("Can only build video decoder extensions on linux and Python != 3.9") + return [] - use_ffmpeg = use_ffmpeg and has_ffmpeg + ffmpeg_exe = shutil.which("ffmpeg") + if ffmpeg_exe is None: + print(f"{build_without_extensions_msg} Couldn't find ffmpeg binary.") + return [] - if use_ffmpeg: + def find_ffmpeg_libraries(): ffmpeg_libraries = {"libavcodec", "libavformat", "libavutil", "libswresample", "libswscale"} ffmpeg_bin = os.path.dirname(ffmpeg_exe) @@ -410,49 +410,56 @@ def get_extensions(): for library in ffmpeg_libraries: library_found = False - for search_path in ffmpeg_include_dir + include_dirs: + for search_path in ffmpeg_include_dir + TORCHVISION_INCLUDE: full_path = os.path.join(search_path, library, "*.h") library_found |= len(glob.glob(full_path)) > 0 if not library_found: - print("Building torchvision without ffmpeg support") - print(f" {library} header files were not found, disabling ffmpeg support") - use_ffmpeg = False - else: - print("Building torchvision without ffmpeg support") + print(f"{build_without_extensions_msg}") + print(f"{library} header files were not found.") + return None, None + + return ffmpeg_include_dir, ffmpeg_library_dir + + ffmpeg_include_dir, ffmpeg_library_dir = find_ffmpeg_libraries() + if ffmpeg_include_dir is None or ffmpeg_library_dir is None: + return [] + + print("Found ffmpeg:") + print(f" ffmpeg include path: {ffmpeg_include_dir}") + print(f" ffmpeg library_dir: {ffmpeg_library_dir}") - if use_ffmpeg: - print("Building torchvision with ffmpeg support") - print(f" ffmpeg version: {ffmpeg_version}") - print(f" ffmpeg include path: {ffmpeg_include_dir}") - print(f" ffmpeg library_dir: {ffmpeg_library_dir}") + extensions = [] + if USE_CPU_VIDEO_DECODER: + print("Building with CPU video decoder support") # TorchVision base decoder + video reader - video_reader_src_dir = os.path.join(this_dir, "torchvision", "csrc", "io", "video_reader") + video_reader_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "video_reader") video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp")) - base_decoder_src_dir = os.path.join(this_dir, "torchvision", "csrc", "io", "decoder") + base_decoder_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "decoder") base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "*.cpp")) # Torchvision video API - videoapi_src_dir = os.path.join(this_dir, "torchvision", "csrc", "io", "video") + videoapi_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "video") videoapi_src = glob.glob(os.path.join(videoapi_src_dir, "*.cpp")) # exclude tests base_decoder_src = [x for x in base_decoder_src if "_test.cpp" not in x] combined_src = video_reader_src + base_decoder_src + videoapi_src - ext_modules.append( + extensions.append( CppExtension( + # This is an awful name. It should be "cpu_video_decoder". Keeping for BC. 
"torchvision.video_reader", combined_src, include_dirs=[ base_decoder_src_dir, video_reader_src_dir, videoapi_src_dir, - extensions_dir, + str(CSRS_DIR), *ffmpeg_include_dir, - *include_dirs, + *TORCHVISION_INCLUDE, ], - library_dirs=ffmpeg_library_dir + library_dirs, + library_dirs=ffmpeg_library_dir + TORCHVISION_LIBRARY, libraries=[ "avcodec", "avformat", @@ -460,41 +467,41 @@ def get_extensions(): "swresample", "swscale", ], - extra_compile_args=["-std=c++14"] if os.name != "nt" else ["/std:c++14", "/MP"], - extra_link_args=["-std=c++14" if os.name != "nt" else "/std:c++14"], + extra_compile_args=["-std=c++17"] if os.name != "nt" else ["/std:c++17", "/MP"], + extra_link_args=["-std=c++17" if os.name != "nt" else "/std:c++17"], ) ) - # Locating video codec - # CUDA_HOME should be set to the cuda root directory. - # TORCHVISION_INCLUDE and TORCHVISION_LIBRARY should include the location to - # video codec header files and libraries respectively. - video_codec_found = ( - extension is CUDAExtension - and CUDA_HOME is not None - and any([os.path.exists(os.path.join(folder, "cuviddec.h")) for folder in vision_include]) - and any([os.path.exists(os.path.join(folder, "nvcuvid.h")) for folder in vision_include]) - and any([os.path.exists(os.path.join(folder, "libnvcuvid.so")) for folder in library_dirs]) - ) + if USE_GPU_VIDEO_DECODER: + # Locating GPU video decoder headers and libraries + # CUDA_HOME should be set to the cuda root directory. + # TORCHVISION_INCLUDE and TORCHVISION_LIBRARY should include the locations + # to the headers and libraries below + if not ( + BUILD_CUDA_SOURCES + and CUDA_HOME is not None + and any([os.path.exists(os.path.join(folder, "cuviddec.h")) for folder in TORCHVISION_INCLUDE]) + and any([os.path.exists(os.path.join(folder, "nvcuvid.h")) for folder in TORCHVISION_INCLUDE]) + and any([os.path.exists(os.path.join(folder, "libnvcuvid.so")) for folder in TORCHVISION_LIBRARY]) + and any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir]) + ): + print("Could not find necessary dependencies. 
Refer to setup.py to check which ones are needed.") + print("Building without GPU video decoder support") + return extensions + print("Building torchvision with GPU video decoder support") - use_video_codec = use_video_codec and video_codec_found - if ( - use_video_codec - and use_ffmpeg - and any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir]) - ): - print("Building torchvision with video codec support") - gpu_decoder_path = os.path.join(extensions_dir, "io", "decoder", "gpu") + gpu_decoder_path = os.path.join(CSRS_DIR, "io", "decoder", "gpu") gpu_decoder_src = glob.glob(os.path.join(gpu_decoder_path, "*.cpp")) cuda_libs = os.path.join(CUDA_HOME, "lib64") cuda_inc = os.path.join(CUDA_HOME, "include") - ext_modules.append( - extension( - "torchvision.Decoder", + _, extra_compile_args = get_macros_and_flags() + extensions.append( + CUDAExtension( + "torchvision.gpu_decoder", gpu_decoder_src, - include_dirs=include_dirs + [gpu_decoder_path] + [cuda_inc] + ffmpeg_include_dir, - library_dirs=ffmpeg_library_dir + library_dirs + [cuda_libs], + include_dirs=[CSRS_DIR] + TORCHVISION_INCLUDE + [gpu_decoder_path] + [cuda_inc] + ffmpeg_include_dir, + library_dirs=ffmpeg_library_dir + TORCHVISION_LIBRARY + [cuda_libs], libraries=[ "avcodec", "avformat", @@ -512,20 +519,8 @@ def get_extensions(): extra_compile_args=extra_compile_args, ) ) - else: - print("Building torchvision without video codec support") - if ( - use_video_codec - and use_ffmpeg - and not any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir]) - ): - print( - " The installed version of ffmpeg is missing the header file 'bsf.h' which is " - " required for GPU video decoding. Please install the latest ffmpeg from conda-forge channel:" - " `conda install -c conda-forge ffmpeg`." 
- ) - return ext_modules + return extensions class clean(distutils.command.clean.clean): @@ -544,15 +539,21 @@ def run(self): if __name__ == "__main__": - print(f"Building wheel {package_name}-{version}") + version, sha = get_version() + write_version_file(version, sha) - write_version_file() + print(f"Building wheel {package_name}-{version}") - with open("README.rst") as f: + with open("README.md") as f: readme = f.read() + extensions = [ + make_C_extension(), + make_image_extension(), + *make_video_decoders_extensions(), + ] + setup( - # Metadata name=package_name, version=version, author="PyTorch Core Team", @@ -560,17 +561,18 @@ def run(self): url="https://github.com/pytorch/vision", description="image and video datasets and models for torch deep learning", long_description=readme, + long_description_content_type="text/markdown", license="BSD", - # Package info packages=find_packages(exclude=("test",)), package_data={package_name: ["*.dll", "*.dylib", "*.so", "prototype/datasets/_builtin/*.categories"]}, zip_safe=False, - install_requires=requirements, + install_requires=get_requirements(), extras_require={ + "gdown": ["gdown>=4.7.3"], "scipy": ["scipy"], }, - ext_modules=get_extensions(), - python_requires=">=3.7", + ext_modules=extensions, + python_requires=">=3.9", cmdclass={ "build_ext": BuildExtension.with_options(no_python_abi_suffix=True), "clean": clean, diff --git a/test/assets/fakedata/draw_boxes_different_label_colors.png b/test/assets/fakedata/draw_boxes_different_label_colors.png new file mode 100644 index 00000000000..72178930602 Binary files /dev/null and b/test/assets/fakedata/draw_boxes_different_label_colors.png differ diff --git a/test/assets/fakedata/draw_boxes_different_label_fill_colors.png b/test/assets/fakedata/draw_boxes_different_label_fill_colors.png new file mode 100644 index 00000000000..b15adb40092 Binary files /dev/null and b/test/assets/fakedata/draw_boxes_different_label_fill_colors.png differ diff --git a/test/assets/fakedata/draw_boxes_util.png b/test/assets/fakedata/draw_boxes_util.png index d38f8be78ac..ee5dac329e0 100644 Binary files a/test/assets/fakedata/draw_boxes_util.png and b/test/assets/fakedata/draw_boxes_util.png differ diff --git a/test/assets/fakedata/draw_keypoints_visibility.png b/test/assets/fakedata/draw_keypoints_visibility.png new file mode 100644 index 00000000000..8cd34f84539 Binary files /dev/null and b/test/assets/fakedata/draw_keypoints_visibility.png differ diff --git a/test/assets/fakedata/draw_rotated_boxes.png b/test/assets/fakedata/draw_rotated_boxes.png new file mode 100644 index 00000000000..4e5a5eb5414 Binary files /dev/null and b/test/assets/fakedata/draw_rotated_boxes.png differ diff --git a/test/assets/fakedata/draw_rotated_boxes_fill.png b/test/assets/fakedata/draw_rotated_boxes_fill.png new file mode 100644 index 00000000000..474b771f04e Binary files /dev/null and b/test/assets/fakedata/draw_rotated_boxes_fill.png differ diff --git a/test/assets/fakedata/logos/rgb_pytorch.avif b/test/assets/fakedata/logos/rgb_pytorch.avif new file mode 100644 index 00000000000..ea1bb586957 Binary files /dev/null and b/test/assets/fakedata/logos/rgb_pytorch.avif differ diff --git a/test/assets/fakedata/logos/rgb_pytorch.webp b/test/assets/fakedata/logos/rgb_pytorch.webp new file mode 100644 index 00000000000..e594584d76d Binary files /dev/null and b/test/assets/fakedata/logos/rgb_pytorch.webp differ diff --git a/test/assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic 
b/test/assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic new file mode 100644 index 00000000000..4c29ac3c71c Binary files /dev/null and b/test/assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic differ diff --git a/test/assets/toosmall_png/heapbof.png b/test/assets/toosmall_png/heapbof.png new file mode 100644 index 00000000000..e720d183342 Binary files /dev/null and b/test/assets/toosmall_png/heapbof.png differ diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 8c5484a2823..ef5d5e1ec96 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -12,14 +12,14 @@ import random import shutil import unittest.mock -import warnings import xml.etree.ElementTree as ET from collections import Counter, defaultdict import numpy as np import pytest import torch -from datasets_utils import combinations_grid, create_image_file, create_image_folder, make_tar, make_zip +from common_utils import combinations_grid +from datasets_utils import create_image_file, create_image_folder, make_tar, make_zip from torch.nn.functional import one_hot from torch.testing import make_tensor as _make_tensor from torchvision.prototype import datasets @@ -519,10 +519,22 @@ def imagenet(root, config): ] num_children = 1 synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5)) - with warnings.catch_warnings(): - # The warning is not for savemat, but rather for some internals savemet is using - warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) - savemat(data_root / "meta.mat", dict(synsets=synsets)) + synsets = np.array( + synsets, + dtype=np.dtype( + [ + ("ILSVRC2012_ID", "O"), + ("WNID", "O"), + ("words", "O"), + ("gloss", "O"), + ("num_children", "O"), + ("children", "O"), + ("wordnet_height", "O"), + ("num_train_images", "O"), + ] + ), + ) + savemat(data_root / "meta.mat", dict(synsets=synsets)) make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz") else: # config["split"] == "test" @@ -661,15 +673,15 @@ class SBDMockData: _NUM_CATEGORIES = 20 @classmethod - def _make_split_files(cls, root_map): - ids_map = { - split: [f"2008_{idx:06d}" for idx in idcs] - for split, idcs in ( - ("train", [0, 1, 2]), - ("train_noval", [0, 2]), - ("val", [3]), - ) - } + def _make_split_files(cls, root_map, *, split): + splits_and_idcs = [ + ("train", [0, 1, 2]), + ("val", [3]), + ] + if split == "train_noval": + splits_and_idcs.append(("train_noval", [0, 2])) + + ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs} for split, ids in ids_map.items(): with open(root_map[split] / f"{split}.txt", "w") as fh: @@ -710,12 +722,14 @@ def _make_segmentation(cls, size): return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy() @classmethod - def generate(cls, root): + def generate(cls, root, *, split): archive_folder = root / "benchmark_RELEASE" dataset_folder = archive_folder / "dataset" dataset_folder.mkdir(parents=True, exist_ok=True) - ids, num_samples_map = cls._make_split_files(defaultdict(lambda: dataset_folder, {"train_noval": root})) + ids, num_samples_map = cls._make_split_files( + defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split + ) sizes = cls._make_anns_folder(dataset_folder, "cls", ids) create_image_folder( dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx] @@ -723,12 +737,12 @@ def generate(cls, root): make_tar(root, "benchmark.tgz", 
archive_folder, compression="gz") - return num_samples_map + return num_samples_map[split] @register_mock(configs=combinations_grid(split=("train", "val", "train_noval"))) def sbd(root, config): - return SBDMockData.generate(root)[config["split"]] + return SBDMockData.generate(root, split=config["split"]) @register_mock(configs=[dict()]) diff --git a/test/common_extended_utils.py b/test/common_extended_utils.py new file mode 100644 index 00000000000..adb794f5db6 --- /dev/null +++ b/test/common_extended_utils.py @@ -0,0 +1,310 @@ +import os +from collections import defaultdict +from numbers import Number +from typing import Any + +import torch +from torch.utils._python_dispatch import TorchDispatchMode + +from torch.utils._pytree import tree_map + +from torchvision.models._api import Weights + +aten = torch.ops.aten +quantized = torch.ops.quantized + + +def get_shape(i): + if isinstance(i, torch.Tensor): + return i.shape + elif hasattr(i, "weight"): + return i.weight().shape + else: + raise ValueError(f"Unknown type {type(i)}") + + +def prod(x): + res = 1 + for i in x: + res *= i + return res + + +def matmul_flop(inputs: list[Any], outputs: list[Any]) -> Number: + """ + Count flops for matmul. + """ + # Inputs should be a list of length 2. + # Inputs contains the shapes of two matrices. + input_shapes = [get_shape(v) for v in inputs] + assert len(input_shapes) == 2, input_shapes + assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes + flop = prod(input_shapes[0]) * input_shapes[-1][-1] + return flop + + +def addmm_flop(inputs: list[Any], outputs: list[Any]) -> Number: + """ + Count flops for fully connected layers. + """ + # Count flop for nn.Linear + # inputs is a list of length 3. + input_shapes = [get_shape(v) for v in inputs[1:3]] + # input_shapes[0]: [batch size, input feature dimension] + # input_shapes[1]: [batch size, output feature dimension] + assert len(input_shapes[0]) == 2, input_shapes[0] + assert len(input_shapes[1]) == 2, input_shapes[1] + batch_size, input_dim = input_shapes[0] + output_dim = input_shapes[1][1] + flops = batch_size * input_dim * output_dim + return flops + + +def bmm_flop(inputs: list[Any], outputs: list[Any]) -> Number: + """ + Count flops for the bmm operation. + """ + # Inputs should be a list of length 2. + # Inputs contains the shapes of two tensor. + assert len(inputs) == 2, len(inputs) + input_shapes = [get_shape(v) for v in inputs] + n, c, t = input_shapes[0] + d = input_shapes[-1][-1] + flop = n * c * t * d + return flop + + +def conv_flop_count( + x_shape: list[int], + w_shape: list[int], + out_shape: list[int], + transposed: bool = False, +) -> Number: + """ + Count flops for convolution. Note only multiplication is + counted. Computation for addition and bias is ignored. + Flops for a transposed convolution are calculated as + flops = (x_shape[2:] * prod(w_shape) * batch_size). + Args: + x_shape (list(int)): The input shape before convolution. + w_shape (list(int)): The filter shape. + out_shape (list(int)): The output shape after convolution. + transposed (bool): is the convolution transposed + Returns: + int: the number of flops + """ + batch_size = x_shape[0] + conv_shape = (x_shape if transposed else out_shape)[2:] + flop = batch_size * prod(w_shape) * prod(conv_shape) + return flop + + +def conv_flop(inputs: list[Any], outputs: list[Any]): + """ + Count flops for convolution. 
+ """ + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) + transposed = inputs[6] + + return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed) + + +def quant_conv_flop(inputs: list[Any], outputs: list[Any]): + """ + Count flops for quantized convolution. + """ + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) + + return conv_flop_count(x_shape, w_shape, out_shape, transposed=False) + + +def transpose_shape(shape): + return [shape[1], shape[0]] + list(shape[2:]) + + +def conv_backward_flop(inputs: list[Any], outputs: list[Any]): + grad_out_shape, x_shape, w_shape = (get_shape(i) for i in inputs[:3]) + output_mask = inputs[-1] + fwd_transposed = inputs[7] + flop_count = 0 + + if output_mask[0]: + grad_input_shape = get_shape(outputs[0]) + flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not fwd_transposed) + if output_mask[1]: + grad_weight_shape = get_shape(outputs[1]) + flop_count += conv_flop_count(transpose_shape(x_shape), grad_out_shape, grad_weight_shape, fwd_transposed) + + return flop_count + + +def scaled_dot_product_flash_attention_flop(inputs: list[Any], outputs: list[Any]): + # FIXME: this needs to count the flops of this kernel + # https://github.com/pytorch/pytorch/blob/207b06d099def9d9476176a1842e88636c1f714f/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp#L52-L267 + return 0 + + +flop_mapping = { + aten.mm: matmul_flop, + aten.matmul: matmul_flop, + aten.addmm: addmm_flop, + aten.bmm: bmm_flop, + aten.convolution: conv_flop, + aten._convolution: conv_flop, + aten.convolution_backward: conv_backward_flop, + quantized.conv2d: quant_conv_flop, + quantized.conv2d_relu: quant_conv_flop, + aten._scaled_dot_product_flash_attention: scaled_dot_product_flash_attention_flop, +} + +unmapped_ops = set() + + +def normalize_tuple(x): + if not isinstance(x, tuple): + return (x,) + return x + + +class FlopCounterMode(TorchDispatchMode): + def __init__(self, model=None): + self.flop_counts = defaultdict(lambda: defaultdict(int)) + self.parents = ["Global"] + # global mod + if model is not None: + for name, module in dict(model.named_children()).items(): + module.register_forward_pre_hook(self.enter_module(name)) + module.register_forward_hook(self.exit_module(name)) + + def enter_module(self, name): + def f(module, inputs): + self.parents.append(name) + inputs = normalize_tuple(inputs) + out = self.create_backwards_pop(name)(*inputs) + return out + + return f + + def exit_module(self, name): + def f(module, inputs, outputs): + assert self.parents[-1] == name + self.parents.pop() + outputs = normalize_tuple(outputs) + return self.create_backwards_push(name)(*outputs) + + return f + + def create_backwards_push(self, name): + class PushState(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args) + if len(args) == 1: + return args[0] + return args + + @staticmethod + def backward(ctx, *grad_outs): + self.parents.append(name) + return grad_outs + + return PushState.apply + + def create_backwards_pop(self, name): + class PopState(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args) + if len(args) == 1: + return args[0] + return args + + @staticmethod + def backward(ctx, *grad_outs): + assert self.parents[-1] == name + self.parents.pop() + return grad_outs + 
+ return PopState.apply + + def __enter__(self): + self.flop_counts.clear() + super().__enter__() + + def __exit__(self, *args): + # print(f"Total: {sum(self.flop_counts['Global'].values()) / 1e9} GFLOPS") + # for mod in self.flop_counts.keys(): + # print(f"Module: ", mod) + # for k, v in self.flop_counts[mod].items(): + # print(f"{k}: {v / 1e9} GFLOPS") + # print() + super().__exit__(*args) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs if kwargs else {} + + out = func(*args, **kwargs) + func_packet = func._overloadpacket + if func_packet in flop_mapping: + flop_count = flop_mapping[func_packet](args, normalize_tuple(out)) + for par in self.parents: + self.flop_counts[par][func_packet] += flop_count + else: + unmapped_ops.add(func_packet) + + return out + + def get_flops(self): + return sum(self.flop_counts["Global"].values()) / 1e9 + + +def get_dims(module_name, height, width): + # detection models have curated input sizes + if module_name == "detection": + # we can feed a batch of 1 for detection model instead of a list of 1 image + dims = (3, height, width) + elif module_name == "video": + # hard-coding the time dimension to size 16 + dims = (1, 16, 3, height, width) + else: + dims = (1, 3, height, width) + + return dims + + +def get_ops(model: torch.nn.Module, weight: Weights, height=512, width=512): + module_name = model.__module__.split(".")[-2] + dims = get_dims(module_name=module_name, height=height, width=width) + + input_tensor = torch.randn(dims) + + # try: + preprocess = weight.transforms() + if module_name == "optical_flow": + inp = preprocess(input_tensor, input_tensor) + else: + # hack to enable mod(*inp) for optical_flow models + inp = [preprocess(input_tensor)] + + model.eval() + + flop_counter = FlopCounterMode(model) + with flop_counter: + # detection models expect a list of 3d tensors as inputs + if module_name == "detection": + model(inp) + else: + model(*inp) + + flops = flop_counter.get_flops() + + return round(flops, 3) + + +def get_file_size_mb(weight): + weights_path = os.path.join(os.getenv("HOME"), ".cache/torch/hub/checkpoints", weight.url.split("/")[-1]) + weights_size_mb = os.path.getsize(weights_path) / 1024 / 1024 + + return round(weights_size_mb, 3) diff --git a/test/common_utils.py b/test/common_utils.py index 7017212fc67..74ad31fea72 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -1,35 +1,45 @@ import contextlib import functools +import itertools import os +import pathlib import random +import re import shutil +import sys import tempfile +import warnings +from subprocess import CalledProcessError, check_output, STDOUT import numpy as np +import PIL +import pytest import torch -from PIL import Image -from torchvision import io +import torch.testing -import __main__ # noqa: 401 +from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair +from torchvision import io, tv_tensors +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.v2.functional import to_image, to_pil_image +from torchvision.utils import _Image_fromarray -IN_CIRCLE_CI = os.getenv("CIRCLECI", False) == "true" +IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1" CUDA_NOT_AVAILABLE_MSG = "CUDA device not available" -CIRCLECI_GPU_NO_CUDA_MSG = "We're in a CircleCI GPU machine, and 
this test doesn't need cuda." +MPS_NOT_AVAILABLE_MSG = "MPS device not available" +OSS_CI_GPU_NO_CUDA_MSG = "We're in an OSS GPU machine, and this test doesn't need cuda." @contextlib.contextmanager def get_tmp_dir(src=None, **kwargs): - tmp_dir = tempfile.mkdtemp(**kwargs) - if src is not None: - os.rmdir(tmp_dir) - shutil.copytree(src, tmp_dir) - try: + with tempfile.TemporaryDirectory( + **kwargs, + ) as tmp_dir: + if src is not None: + shutil.copytree(src, tmp_dir) yield tmp_dir - finally: - shutil.rmtree(tmp_dir) def set_rng_seed(seed): @@ -107,18 +117,28 @@ def disable_console_output(): yield -def cpu_and_gpu(): +def cpu_and_cuda(): import pytest # noqa return ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda)) +def cpu_and_cuda_and_mps(): + return cpu_and_cuda() + (pytest.param("mps", marks=pytest.mark.needs_mps),) + + def needs_cuda(test_func): import pytest # noqa return pytest.mark.needs_cuda(test_func) +def needs_mps(test_func): + import pytest # noqa + + return pytest.mark.needs_mps(test_func) + + def _create_data(height=3, width=3, channels=3, device="cpu"): # TODO: When all relevant tests are ported to pytest, turn this into a module-level fixture tensor = torch.randint(0, 256, (channels, height, width), dtype=torch.uint8, device=device) @@ -127,7 +147,7 @@ def _create_data(height=3, width=3, channels=3, device="cpu"): if channels == 1: mode = "L" data = data[..., 0] - pil_img = Image.fromarray(data, mode=mode) + pil_img = _Image_fromarray(data, mode=mode) return tensor, pil_img @@ -137,9 +157,6 @@ def _create_data_batch(height=3, width=3, channels=3, num_samples=4, device="cpu return batch_tensor -assert_equal = functools.partial(torch.testing.assert_close, rtol=0, atol=0) - - def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None): names = [] for i in range(num_videos): @@ -160,6 +177,7 @@ def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None): def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None): + # FIXME: this is handled automatically by `assert_equal` below. Let's remove this in favor of it np_pil_image = np.array(pil_image) if np_pil_image.ndim == 2: np_pil_image = np_pil_image[:, :, None] @@ -172,6 +190,7 @@ def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None): def _assert_approx_equal_tensor_to_pil( tensor, pil_image, tol=1e-5, msg=None, agg_method="mean", allowed_percentage_diff=None ): + # FIXME: this is handled automatically by `assert_close` below. Let's remove this in favor of it # TODO: we could just merge this into _assert_equal_tensor_to_pil np_pil_image = np.array(pil_image) if np_pil_image.ndim == 2: @@ -202,3 +221,321 @@ def _test_fn_on_batch(batch_tensors, fn, scripted_fn_atol=1e-8, **fn_kwargs): # scriptable function test s_transformed_batch = scripted_fn(batch_tensors, **fn_kwargs) torch.testing.assert_close(transformed_batch, s_transformed_batch, rtol=1e-5, atol=scripted_fn_atol) + + +def cache(fn): + """Similar to :func:`functools.cache` (Python >= 3.8) or :func:`functools.lru_cache` with infinite cache size, + but this also caches exceptions. + """ + sentinel = object() + out_cache = {} + exc_tb_cache = {} + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + key = args + tuple(kwargs.values()) + + out = out_cache.get(key, sentinel) + if out is not sentinel: + return out + + exc_tb = exc_tb_cache.get(key, sentinel) + if exc_tb is not sentinel: + raise exc_tb[0].with_traceback(exc_tb[1]) + + try: + out = fn(*args, **kwargs) + except Exception as exc: + # We need to cache the traceback here as well. 
Otherwise, each re-raise will add the internal pytest + # traceback frames anew, but they will only be removed once. Thus, the traceback will be ginormous hiding + # the actual information in the noise. See https://github.com/pytest-dev/pytest/issues/10363 for details. + exc_tb_cache[key] = exc, exc.__traceback__ + raise exc + + out_cache[key] = out + return out + + return wrapper + + +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values. + + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + +class ImagePair(TensorLikePair): + def __init__( + self, + actual, + expected, + *, + mae=False, + **other_parameters, + ): + if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]): + actual, expected = (to_image(input) for input in [actual, expected]) + + super().__init__(actual, expected, **other_parameters) + self.mae = mae + + def compare(self) -> None: + actual, expected = self.actual, self.expected + + self._compare_attributes(actual, expected) + actual, expected = self._equalize_attributes(actual, expected) + + if self.mae: + if actual.dtype is torch.uint8: + actual, expected = actual.to(torch.int), expected.to(torch.int) + mae = float(torch.abs(actual - expected).float().mean()) + if mae > self.atol: + self._fail( + AssertionError, + f"The MAE of the images is {mae}, but only {self.atol} is allowed.", + ) + else: + super()._compare_values(actual, expected) + + +def assert_close( + actual, + expected, + *, + allow_subclasses=True, + rtol=None, + atol=None, + equal_nan=False, + check_device=True, + check_dtype=True, + check_layout=True, + check_stride=False, + msg=None, + **kwargs, +): + """Superset of :func:`torch.testing.assert_close` with support for PIL vs. 
tensor image comparison""" + __tracebackhide__ = True + + error_metas = not_close_error_metas( + actual, + expected, + pair_types=( + NonePair, + BooleanPair, + NumberPair, + ImagePair, + TensorLikePair, + ), + allow_subclasses=allow_subclasses, + rtol=rtol, + atol=atol, + equal_nan=equal_nan, + check_device=check_device, + check_dtype=check_dtype, + check_layout=check_layout, + check_stride=check_stride, + **kwargs, + ) + + if error_metas: + raise error_metas[0].to_error(msg) + + +assert_equal = functools.partial(assert_close, rtol=0, atol=0) + + +DEFAULT_SIZE = (17, 11) + + +NUM_CHANNELS_MAP = { + "GRAY": 1, + "GRAY_ALPHA": 2, + "RGB": 3, + "RGBA": 4, +} + + +def make_image( + size=DEFAULT_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, +): + num_channels = NUM_CHANNELS_MAP[color_space] + dtype = dtype or torch.uint8 + max_value = get_max_value(dtype) + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"}: + data[..., -1, :, :] = max_value + + return tv_tensors.Image(data) + + +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_pil_image(make_image(*args, **kwargs)) + + +def make_keypoints(canvas_size=DEFAULT_SIZE, *, num_points=4, dtype=None, device="cpu"): + y = torch.randint(0, canvas_size[0], size=(num_points, 1), dtype=dtype, device=device) + x = torch.randint(0, canvas_size[1], size=(num_points, 1), dtype=dtype, device=device) + return tv_tensors.KeyPoints(torch.cat((x, y), dim=-1), canvas_size=canvas_size) + + +def make_bounding_boxes( + canvas_size=DEFAULT_SIZE, + *, + format=tv_tensors.BoundingBoxFormat.XYXY, + clamping_mode="soft", + num_boxes=1, + dtype=None, + device="cpu", +): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. 
+ return torch.stack([torch.randint(max_value - v, ()) for v in values.tolist()]) + + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[format] + + dtype = dtype or torch.float32 + + h, w = (torch.randint(1, s, (num_boxes,)) for s in canvas_size) + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) + r = -360 * torch.rand((num_boxes,)) + 180 + + if format is tv_tensors.BoundingBoxFormat.XYWH: + parts = (x, y, w, h) + elif format is tv_tensors.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is tv_tensors.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h) + elif format is tv_tensors.BoundingBoxFormat.XYWHR: + parts = (x, y, w, h, r) + elif format is tv_tensors.BoundingBoxFormat.CXCYWHR: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h, r) + elif format is tv_tensors.BoundingBoxFormat.XYXYXYXY: + r_rad = r * torch.pi / 180.0 + cos, sin = torch.cos(r_rad), torch.sin(r_rad) + x1 = x + y1 = y + x2 = x1 + w * cos + y2 = y1 - w * sin + x3 = x2 + h * sin + y3 = y2 + h * cos + x4 = x1 + h * sin + y4 = y1 + h * cos + parts = (x1, y1, x2, y2, x3, y3, x4, y4) + else: + raise ValueError(f"Format {format} is not supported") + out_boxes = torch.stack(parts, dim=-1).to(dtype=dtype, device=device) + return tv_tensors.BoundingBoxes(out_boxes, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode) + + +def make_detection_masks(size=DEFAULT_SIZE, *, num_masks=1, dtype=None, device="cpu"): + """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (num_masks, *size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) + ) + + +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): + """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (*batch_dims, *size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) + ) + + +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return tv_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + +def make_video_tensor(*args, **kwargs): + return make_video(*args, **kwargs).as_subclass(torch.Tensor) + + +def assert_run_python_script(source_code): + """Utility to check assertions in an independent Python subprocess. + + The script provided in the source code should return 0 and not print + anything on stderr or stdout. Modified from scikit-learn test utils. + + Args: + source_code (str): The Python source code to execute. + """ + with get_tmp_dir() as root: + path = pathlib.Path(root) / "main.py" + with open(path, "w") as file: + file.write(source_code) + + try: + out = check_output([sys.executable, str(path)], stderr=STDOUT) + except CalledProcessError as e: + raise RuntimeError(f"script errored with output:\n{e.output.decode()}") + if out != b"": + raise AssertionError(out.decode()) + + +@contextlib.contextmanager +def assert_no_warnings(): + # The name `catch_warnings` is a misnomer as the context manager does **not** catch any warnings, but rather scopes + # the warning filters. All changes that are made to the filters while in this context, will be reset upon exit. 
+ with warnings.catch_warnings(): + warnings.simplefilter("error") + yield + + +@contextlib.contextmanager +def ignore_jit_no_profile_information_warning(): + # Calling a scripted object often triggers a warning like + # `UserWarning: operator() profile_node %$INT1 : int[] = prim::profile_ivalue($INT2) does not have profile information` + # with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore + # them. + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=re.escape("operator() profile_node %"), category=UserWarning) + yield diff --git a/test/conftest.py b/test/conftest.py index 1a9b2db7f5c..a9768598ded 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,22 +3,32 @@ import numpy as np import pytest import torch -from common_utils import CIRCLECI_GPU_NO_CUDA_MSG, CUDA_NOT_AVAILABLE_MSG, IN_CIRCLE_CI, IN_FBCODE, IN_RE_WORKER + +from common_utils import ( + CUDA_NOT_AVAILABLE_MSG, + IN_FBCODE, + IN_OSS_CI, + IN_RE_WORKER, + MPS_NOT_AVAILABLE_MSG, + OSS_CI_GPU_NO_CUDA_MSG, +) def pytest_configure(config): # register an additional marker (see pytest_collection_modifyitems) config.addinivalue_line("markers", "needs_cuda: mark for tests that rely on a CUDA device") + config.addinivalue_line("markers", "needs_mps: mark for tests that rely on a MPS device") config.addinivalue_line("markers", "dont_collect: mark for tests that should not be collected") + config.addinivalue_line("markers", "opcheck_only_one: only opcheck one parametrization") def pytest_collection_modifyitems(items): # This hook is called by pytest after it has collected the tests (google its name to check out its doc!) # We can ignore some tests as we see fit here, or add marks, such as a skip mark. # - # Typically here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the + # Typically, here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the # tests that don't need CUDA, because those tests are extensively tested in the CPU CI instances already. - # This is true for both CircleCI and the fbcode internal CI. + # This is true for both OSS CI and the fbcode internal CI. # In the fbcode CI, we have an additional constraint: we try to avoid skipping tests. So instead of relying on # pytest.mark.skip, in fbcode we literally just remove those tests from the `items` list, and it's as if # these tests never existed. @@ -28,16 +38,20 @@ def pytest_collection_modifyitems(items): # The needs_cuda mark will exist if the test was explicitly decorated with # the @needs_cuda decorator. It will also exist if it was parametrized with a # parameter that has the mark: for example if a test is parametrized with - # @pytest.mark.parametrize('device', cpu_and_gpu()) + # @pytest.mark.parametrize('device', cpu_and_cuda()) # the "instances" of the tests where device == 'cuda' will have the 'needs_cuda' mark, # and the ones with device == 'cpu' won't have the mark. 
needs_cuda = item.get_closest_marker("needs_cuda") is not None + needs_mps = item.get_closest_marker("needs_mps") is not None if needs_cuda and not torch.cuda.is_available(): # In general, we skip cuda tests on machines without a GPU # There are special cases though, see below item.add_marker(pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG)) + if needs_mps and not torch.backends.mps.is_available(): + item.add_marker(pytest.mark.skip(reason=MPS_NOT_AVAILABLE_MSG)) + if IN_FBCODE: # fbcode doesn't like skipping tests, so instead we just don't collect the test # so that they don't even "exist", hence the continue statements. @@ -49,15 +63,18 @@ def pytest_collection_modifyitems(items): # TODO: something more robust would be to do that only in a sandcastle instance, # so that we can still see the test being skipped when testing locally from a devvm continue - elif IN_CIRCLE_CI: + if needs_mps and not torch.backends.mps.is_available(): + # Same as above, but for MPS + continue + elif IN_OSS_CI: # Here we're not in fbcode, so we can safely collect and skip tests. if not needs_cuda and torch.cuda.is_available(): - # Similar to what happens in RE workers: we don't need the CircleCI GPU machines + # Similar to what happens in RE workers: we don't need the OSS CI GPU machines # to run the CPU-only tests. - item.add_marker(pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG)) + item.add_marker(pytest.mark.skip(reason=OSS_CI_GPU_NO_CUDA_MSG)) if item.get_closest_marker("dont_collect") is not None: - # currently, this is only used for some tests we're sure we dont want to run on fbcode + # currently, this is only used for some tests we're sure we don't want to run on fbcode continue out_items.append(item) diff --git a/test/cpp/test_custom_operators.cpp b/test/cpp/test_custom_operators.cpp index e68f6c2f029..5178575d21b 100644 --- a/test/cpp/test_custom_operators.cpp +++ b/test/cpp/test_custom_operators.cpp @@ -7,7 +7,8 @@ TEST(test_custom_operators, nms) { // make sure that the torchvision ops are visible to the jit interpreter - auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::nms")); + auto& ops = torch::jit::getAllOperatorsFor( + torch::jit::Symbol::fromQualString("torchvision::nms")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); @@ -24,29 +25,35 @@ TEST(test_custom_operators, nms) { at::Tensor output = vision::ops::nms(boxes, scores, thresh); ASSERT_TRUE(output_jit.allclose(output)); - } TEST(test_custom_operators, roi_align_visible) { - // make sure that the torchvision ops are visible to the jit interpreter even if - // not explicitly included - auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::roi_align")); + // make sure that the torchvision ops are visible to the jit interpreter even + // if not explicitly included + auto& ops = torch::jit::getAllOperatorsFor( + torch::jit::Symbol::fromQualString("torchvision::roi_align")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); ASSERT_EQ(op->schema().name(), "torchvision::roi_align"); torch::jit::Stack stack; - float roi_data[] = { - 0., 0., 0., 5., 5., - 0., 5., 5., 10., 10. 
- }; - at::Tensor input = at::rand({1, 2, 10, 10}), rois = at::from_blob(roi_data, {2, 5}); + float roi_data[] = {0., 0., 0., 5., 5., 0., 5., 5., 10., 10.}; + at::Tensor input = at::rand({1, 2, 10, 10}), + rois = at::from_blob(roi_data, {2, 5}); double spatial_scale = 1.0; int64_t pooled_height = 3, pooled_width = 3, sampling_ratio = -1; bool aligned = true; - torch::jit::push(stack, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned); + torch::jit::push( + stack, + input, + rois, + spatial_scale, + pooled_height, + pooled_width, + sampling_ratio, + aligned); op->getOperation()(stack); at::Tensor output_jit; torch::jit::pop(stack, output_jit); diff --git a/test/datasets_utils.py b/test/datasets_utils.py index c232e7132b4..cbfb26b6c6b 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -5,6 +5,7 @@ import itertools import os import pathlib +import platform import random import shutil import string @@ -14,7 +15,8 @@ import unittest.mock import zipfile from collections import defaultdict -from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union +from collections.abc import Iterator, Sequence +from typing import Any, Callable, Optional, Union import numpy as np @@ -25,7 +27,12 @@ import torchvision.datasets import torchvision.io from common_utils import disable_console_output, get_tmp_dir +from torch.utils._pytree import tree_any +from torch.utils.data import DataLoader +from torchvision import tv_tensors +from torchvision.datasets import wrap_dataset_for_transforms_v2 from torchvision.transforms.functional import get_dimensions +from torchvision.transforms.v2.functional import get_size __all__ = [ @@ -57,6 +64,7 @@ class LazyImporter: provide modules listed in MODULES as attributes. They are only imported when accessed. """ + MODULES = ( "av", "lmdb", @@ -137,7 +145,7 @@ def test_foo(self, config): .. note:: - This will try to remove duplicate configurations. During this process it will not not preserve a potential + This will try to remove duplicate configurations. During this process it will not preserve a potential ordering of the configurations or an inner ordering of a configuration. """ @@ -146,7 +154,7 @@ def maybe_remove_duplicates(configs): return [dict(config_) for config_ in {tuple(sorted(config.items())) for config in configs}] except TypeError: # A TypeError will be raised if a value of any config is not hashable, e.g. a list. In that case duplicate - # removal would be a lot more elaborate and we simply bail out. + # removal would be a lot more elaborate, and we simply bail out. return configs @functools.wraps(test) @@ -169,23 +177,6 @@ def wrapper(self): return wrapper -def combinations_grid(**kwargs): - """Creates a grid of input combinations. - - Each element in the returned sequence is a dictionary containing one possible combination as values. - - Example: - >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) - [ - {'foo': 'bar', 'spam': 'eggs'}, - {'foo': 'bar', 'spam': 'ham'}, - {'foo': 'baz', 'spam': 'eggs'}, - {'foo': 'baz', 'spam': 'ham'} - ] - """ - return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] - - class DatasetTestCase(unittest.TestCase): """Abstract base class for all dataset testcases. 
@@ -291,13 +282,13 @@ def test_baz(self): "download_and_extract_archive", } - def dataset_args(self, tmpdir: str, config: Dict[str, Any]) -> Sequence[Any]: + def dataset_args(self, tmpdir: str, config: dict[str, Any]) -> Sequence[Any]: """Define positional arguments passed to the dataset. .. note:: The default behavior is only valid if the dataset to be tested has ``root`` as the only required parameter. - Otherwise you need to overwrite this method. + Otherwise, you need to overwrite this method. Args: tmpdir (str): Path to a temporary directory. For most cases this acts as root directory for the dataset @@ -310,7 +301,7 @@ def dataset_args(self, tmpdir: str, config: Dict[str, Any]) -> Sequence[Any]: """ return (tmpdir,) - def inject_fake_data(self, tmpdir: str, config: Dict[str, Any]) -> Union[int, Dict[str, Any]]: + def inject_fake_data(self, tmpdir: str, config: dict[str, Any]) -> Union[int, dict[str, Any]]: """Inject fake data for dataset into a temporary directory. During the creation of the dataset the download and extract logic is disabled. Thus, the fake data injected @@ -334,11 +325,11 @@ def inject_fake_data(self, tmpdir: str, config: Dict[str, Any]) -> Union[int, Di @contextlib.contextmanager def create_dataset( self, - config: Optional[Dict[str, Any]] = None, + config: Optional[dict[str, Any]] = None, inject_fake_data: bool = True, patch_checks: Optional[bool] = None, **kwargs: Any, - ) -> Iterator[Tuple[torchvision.datasets.VisionDataset, Dict[str, Any]]]: + ) -> Iterator[tuple[torchvision.datasets.VisionDataset, dict[str, Any]]]: r"""Create the dataset in a temporary directory. The configuration passed to the dataset is populated to contain at least all parameters with default values. @@ -564,7 +555,7 @@ def test_feature_types(self, config): @test_all_configs def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): - assert len(dataset) == info["num_examples"] + assert len(list(dataset)) == len(dataset) == info["num_examples"] @test_all_configs def test_transforms(self, config): @@ -581,6 +572,39 @@ def test_transforms(self, config): mock.assert_called() + @test_all_configs + def test_transforms_v2_wrapper(self, config): + try: + with self.create_dataset(config) as (dataset, info): + for target_keys in [None, "all"]: + if target_keys is not None and self.DATASET_CLASS not in { + torchvision.datasets.CocoDetection, + torchvision.datasets.VOCDetection, + torchvision.datasets.Kitti, + torchvision.datasets.WIDERFace, + }: + with self.assertRaisesRegex(ValueError, "`target_keys` is currently only supported for"): + wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + continue + + wrapped_dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + assert isinstance(wrapped_dataset, self.DATASET_CLASS) + assert len(wrapped_dataset) == info["num_examples"] + + wrapped_sample = wrapped_dataset[0] + assert tree_any( + lambda item: isinstance(item, (tv_tensors.TVTensor, PIL.Image.Image)), wrapped_sample + ) + except TypeError as error: + msg = f"No wrapper exists for dataset class {type(dataset).__name__}" + if str(error).startswith(msg): + pytest.skip(msg) + raise error + except RuntimeError as error: + if "currently not supported by this wrapper" in str(error): + pytest.skip("Config is currently not supported by this wrapper") + raise error + class ImageDatasetTestCase(DatasetTestCase): """Abstract base class for image dataset testcases. 
@@ -589,43 +613,56 @@ class ImageDatasetTestCase(DatasetTestCase): """ FEATURE_TYPES = (PIL.Image.Image, int) + SUPPORT_TV_IMAGE_DECODE: bool = False @contextlib.contextmanager def create_dataset( self, - config: Optional[Dict[str, Any]] = None, + config: Optional[dict[str, Any]] = None, inject_fake_data: bool = True, patch_checks: Optional[bool] = None, **kwargs: Any, - ) -> Iterator[Tuple[torchvision.datasets.VisionDataset, Dict[str, Any]]]: + ) -> Iterator[tuple[torchvision.datasets.VisionDataset, dict[str, Any]]]: with super().create_dataset( config=config, inject_fake_data=inject_fake_data, patch_checks=patch_checks, **kwargs, ) as (dataset, info): - # PIL.Image.open() only loads the image meta data upfront and keeps the file open until the first access + # PIL.Image.open() only loads the image metadata upfront and keeps the file open until the first access # to the pixel data occurs. Trying to delete such a file results in an PermissionError on Windows. Thus, we # force-load opened images. # This problem only occurs during testing since some tests, e.g. DatasetTestCase.test_feature_types open an # image, but never use the underlying data. During normal operation it is reasonable to assume that the # user wants to work with the image he just opened rather than deleting the underlying file. - with self._force_load_images(): + with self._force_load_images(loader=(config or {}).get("loader", None)): yield dataset, info @contextlib.contextmanager - def _force_load_images(self): - open = PIL.Image.open + def _force_load_images(self, loader: Optional[Callable[[str], Any]] = None): + open = loader or PIL.Image.open def new(fp, *args, **kwargs): image = open(fp, *args, **kwargs) - if isinstance(fp, (str, pathlib.Path)): + if isinstance(fp, (str, pathlib.Path)) and isinstance(image, PIL.Image.Image): image.load() return image - with unittest.mock.patch("PIL.Image.open", new=new): + with unittest.mock.patch(open.__module__ + "." + open.__qualname__, new=new): yield + def test_tv_decode_image_support(self): + if not self.SUPPORT_TV_IMAGE_DECODE: + pytest.skip(f"{self.DATASET_CLASS.__name__} does not support torchvision.io.decode_image.") + + with self.create_dataset( + config=dict( + loader=torchvision.io.decode_image, + ) + ) as (dataset, _): + image = dataset[0][0] + assert isinstance(image, torch.Tensor) + class VideoDatasetTestCase(DatasetTestCase): """Abstract base class for video dataset testcases. 
@@ -641,27 +678,76 @@ class VideoDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (torch.Tensor, torch.Tensor, int) REQUIRED_PACKAGES = ("av",) - DEFAULT_FRAMES_PER_CLIP = 1 + FRAMES_PER_CLIP = 1 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.dataset_args = self._set_default_frames_per_clip(self.dataset_args) - def _set_default_frames_per_clip(self, inject_fake_data): + def _set_default_frames_per_clip(self, dataset_args): argspec = inspect.getfullargspec(self.DATASET_CLASS.__init__) args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" - @functools.wraps(inject_fake_data) + @functools.wraps(dataset_args) def wrapper(tmpdir, config): - args = inject_fake_data(tmpdir, config) + args = dataset_args(tmpdir, config) if frames_per_clip_last and len(args) == len(args_without_default) - 1: - args = (*args, self.DEFAULT_FRAMES_PER_CLIP) + args = (*args, self.FRAMES_PER_CLIP) return args return wrapper + def test_output_format(self): + for output_format in ["TCHW", "THWC"]: + with self.create_dataset(output_format=output_format) as (dataset, _): + for video, *_ in dataset: + if output_format == "TCHW": + num_frames, num_channels, *_ = video.shape + else: # output_format == "THWC": + num_frames, *_, num_channels = video.shape + + assert num_frames == self.FRAMES_PER_CLIP + assert num_channels == 3 + + @test_all_configs + def test_transforms_v2_wrapper(self, config): + # `output_format == "THWC"` is not supported by the wrapper. Thus, we skip the `config` if it is set explicitly + # or use the supported `"TCHW"` + if config.setdefault("output_format", "TCHW") == "THWC": + return + + super().test_transforms_v2_wrapper.__wrapped__(self, config) + + +def _no_collate(batch): + return batch + + +def check_transforms_v2_wrapper_spawn(dataset, expected_size): + # This check ensures that the wrapped datasets can be used with multiprocessing_context="spawn" in the DataLoader. + # We also check that transforms are applied correctly as a non-regression test for + # https://github.com/pytorch/vision/issues/8066 + # Implicitly, this also checks that the wrapped datasets are pickleable. + + # To save CI/test time, we only check on Windows where "spawn" is the default + if platform.system() != "Windows": + pytest.skip("Multiprocessing spawning is only checked on Windows.") + + wrapped_dataset = wrap_dataset_for_transforms_v2(dataset) + + dataloader = DataLoader(wrapped_dataset, num_workers=2, multiprocessing_context="spawn", collate_fn=_no_collate) + + def resize_was_applied(item): + # Checking the size of the output ensures that the Resize transform was correctly applied + return isinstance(item, (tv_tensors.Image, tv_tensors.Video, PIL.Image.Image)) and get_size(item) == list( + expected_size + ) + + for wrapped_sample in dataloader: + assert tree_any(resize_was_applied, wrapped_sample) + def create_image_or_video_tensor(size: Sequence[int]) -> torch.Tensor: r"""Create a random uint8 tensor. @@ -715,7 +801,7 @@ def create_image_folder( num_examples: int, size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, **kwargs: Any, -) -> List[pathlib.Path]: +) -> list[pathlib.Path]: """Create a folder of random images. 
Args: @@ -737,7 +823,7 @@ def create_image_folder( """ if size is None: - def size(idx: int) -> Tuple[int, int, int]: + def size(idx: int) -> tuple[int, int, int]: num_channels = 3 height, width = torch.randint(3, 11, size=(2,), dtype=torch.int).tolist() return (num_channels, height, width) @@ -786,7 +872,7 @@ def create_video_file( fps: float = 25, **kwargs: Any, ) -> pathlib.Path: - """Create an video file from random data. + """Create a video file from random data. Args: root (Union[str, pathlib.Path]): Root directory the video file will be placed in. @@ -829,7 +915,7 @@ def create_video_folder( size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, fps=25, **kwargs, -) -> List[pathlib.Path]: +) -> list[pathlib.Path]: """Create a folder of random videos. Args: @@ -951,7 +1037,7 @@ def create_random_string(length: int, *digits: str) -> str: Args: length (int): Number of characters in the generated string. - *characters (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`. + *digits (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`. """ if not digits: digits = string.ascii_lowercase diff --git a/test/expect/ModelTester.test_crestereo_base_expect.pkl b/test/expect/ModelTester.test_crestereo_base_expect.pkl new file mode 100644 index 00000000000..e5b8cd8f666 Binary files /dev/null and b/test/expect/ModelTester.test_crestereo_base_expect.pkl differ diff --git a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl index e95ba5f5398..862af2185c7 100644 Binary files a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl index c2875679efd..1d317eb7915 100644 Binary files a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl index 0657261d96c..3d4e3e63f28 100644 Binary files a/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl index 2f1ff941aba..54dfb7cd206 100644 Binary files a/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl index 36b68081672..f52b77a8dd8 100644 Binary files a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl index c6d1fd14081..23e841bf874 100644 Binary files a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl differ diff --git 
a/test/expect/ModelTester.test_maxvit_t_expect.pkl b/test/expect/ModelTester.test_maxvit_t_expect.pkl new file mode 100644 index 00000000000..3a93545f614 Binary files /dev/null and b/test/expect/ModelTester.test_maxvit_t_expect.pkl differ diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl index 7fb8d66b080..f188ee7b911 100644 Binary files a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl index 9c74f2e9b99..beaf6c8e84b 100644 Binary files a/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_b_expect.pkl b/test/expect/ModelTester.test_swin3d_b_expect.pkl new file mode 100644 index 00000000000..1efc513c911 Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_b_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_s_expect.pkl b/test/expect/ModelTester.test_swin3d_s_expect.pkl new file mode 100644 index 00000000000..0c1e594993e Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_s_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_t_expect.pkl b/test/expect/ModelTester.test_swin3d_t_expect.pkl new file mode 100644 index 00000000000..5e658ff16b7 Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_t_expect.pkl differ diff --git a/test/optests_failures_dict.json b/test/optests_failures_dict.json new file mode 100644 index 00000000000..3bad0bbb027 --- /dev/null +++ b/test/optests_failures_dict.json @@ -0,0 +1,5 @@ +{ + "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. 
For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit", + "_version": 1, + "data": {} +} diff --git a/test/smoke_test.py b/test/smoke_test.py index c3a4bdd19d6..e2a3b5068ab 100644 --- a/test/smoke_test.py +++ b/test/smoke_test.py @@ -1,4 +1,147 @@ +"""Run smoke tests""" + +import os +import sys +import sysconfig +from pathlib import Path + import torch import torchvision -import torchvision.datasets as dset -import torchvision.transforms +from torchvision.io import decode_avif, decode_heic, decode_image, decode_jpeg, read_file +from torchvision.models import resnet50, ResNet50_Weights + + +SCRIPT_DIR = Path(__file__).parent + + +def smoke_test_torchvision() -> None: + print( + "Is torchvision usable?", + all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]), + ) + + +def smoke_test_torchvision_read_decode() -> None: + img_jpg = decode_image(str(SCRIPT_DIR / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")) + if img_jpg.shape != (3, 606, 517): + raise RuntimeError(f"Unexpected shape of img_jpg: {img_jpg.shape}") + + img_png = decode_image(str(SCRIPT_DIR / "assets" / "interlaced_png" / "wizard_low.png")) + if img_png.shape != (4, 471, 354): + raise RuntimeError(f"Unexpected shape of img_png: {img_png.shape}") + + img_webp = decode_image(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch.webp")) + if img_webp.shape != (3, 100, 100): + raise RuntimeError(f"Unexpected shape of img_webp: {img_webp.shape}") + + if sys.platform == "linux": + pass + # TODO: Fix/uncomment below (the TODO below is mostly accurate but we're + # still observing some failures on some CUDA jobs. Most are working.) + # if torch.cuda.is_available(): + # # TODO: For whatever reason this only passes on the runners that + # # support CUDA. + # # Strangely, on the CPU runners where this fails, the AVIF/HEIC + # # tests (ran with pytest) are passing. This is likely related to a + # # libcxx symbol thing, and the proper libstdc++.so get loaded only + # # with pytest? Ugh. 
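The shape checks above all follow the same pattern: decode a known asset and compare against a hard-coded (C, H, W) shape. A hedged helper that factors out that pattern (the function name and the commented call are illustrative; the asset path and expected shape are the ones used by the smoke test above):

    from torchvision.io import decode_image

    def check_decoded_shape(path, expected_shape):
        # Decode an image from disk and fail loudly if its (C, H, W) shape is not
        # what we expect, mirroring smoke_test_torchvision_read_decode above.
        img = decode_image(str(path))
        if tuple(img.shape) != tuple(expected_shape):
            raise RuntimeError(f"Unexpected shape of {path}: {img.shape}, expected {expected_shape}")
        return img

    # e.g. the Grace Hopper asset decodes to 3 x 606 x 517 (C, H, W):
    # check_decoded_shape("test/assets/encode_jpeg/grace_hopper_517x606.jpg", (3, 606, 517))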
+ # img_avif = decode_avif(read_file(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch.avif"))) + # if img_avif.shape != (3, 100, 100): + # raise RuntimeError(f"Unexpected shape of img_avif: {img_avif.shape}") + + # img_heic = decode_heic( + # read_file(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic")) + # ) + # if img_heic.shape != (3, 100, 100): + # raise RuntimeError(f"Unexpected shape of img_heic: {img_heic.shape}") + else: + try: + decode_avif(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch.avif")) + except RuntimeError as e: + assert "torchvision-extra-decoders" in str(e) + + try: + decode_heic(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic")) + except RuntimeError as e: + assert "torchvision-extra-decoders" in str(e) + + +def smoke_test_torchvision_decode_jpeg(device: str = "cpu"): + img_jpg_data = read_file(str(SCRIPT_DIR / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")) + img_jpg = decode_jpeg(img_jpg_data, device=device) + if img_jpg.shape != (3, 606, 517): + raise RuntimeError(f"Unexpected shape of img_jpg: {img_jpg.shape}") + + +def smoke_test_compile() -> None: + try: + model = resnet50().cuda() + model = torch.compile(model) + x = torch.randn(1, 3, 224, 224, device="cuda") + out = model(x) + print(f"torch.compile model output: {out.shape}") + except RuntimeError: + if sys.platform == "win32": + print("Successfully caught torch.compile RuntimeError on win") + else: + raise + + +def smoke_test_torchvision_resnet50_classify(device: str = "cpu") -> None: + img = decode_image(str(SCRIPT_DIR / ".." / "gallery" / "assets" / "dog2.jpg")).to(device) + + # Step 1: Initialize model with the best available weights + weights = ResNet50_Weights.DEFAULT + model = resnet50(weights=weights, progress=False).to(device) + model.eval() + + # Step 2: Initialize the inference transforms + preprocess = weights.transforms(antialias=True) + + # Step 3: Apply inference preprocessing transforms + batch = preprocess(img).unsqueeze(0) + + # Step 4: Use the model and print the predicted category + prediction = model(batch).squeeze(0).softmax(0) + class_id = prediction.argmax().item() + score = prediction[class_id].item() + category_name = weights.meta["categories"][class_id] + expected_category = "German shepherd" + print(f"{category_name} ({device}): {100 * score:.1f}%") + if category_name != expected_category: + raise RuntimeError(f"Failed ResNet50 classify {category_name} Expected: {expected_category}") + + +def main() -> None: + print(f"torchvision: {torchvision.__version__}") + print(f"torch.cuda.is_available: {torch.cuda.is_available()}") + + print(f"{torch.ops.image._jpeg_version() = }") + if not torch.ops.image._is_compiled_against_turbo(): + msg = "Torchvision wasn't compiled against libjpeg-turbo" + if os.getenv("IS_M1_CONDA_BUILD_JOB") == "1": + # When building the conda package on M1, it's difficult to enforce + # that we build against turbo due to interactions with the libwebp + # package. So we just accept it, instead of raising an error. 
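The classification smoke test above exercises the weights/transforms API end to end: pick `ResNet50_Weights.DEFAULT`, build the matching preprocessing pipeline with `weights.transforms()`, and read the category names from `weights.meta["categories"]`. A small variation on that flow, reporting the top-5 categories instead of only the argmax (this sketch downloads the pretrained weights on first use; the function name and the image path are caller-supplied, not part of the smoke test):

    import torch
    from torchvision.io import decode_image
    from torchvision.models import resnet50, ResNet50_Weights

    def classify_top5(image_path, device="cpu"):
        weights = ResNet50_Weights.DEFAULT
        model = resnet50(weights=weights, progress=False).to(device).eval()
        preprocess = weights.transforms(antialias=True)

        img = decode_image(str(image_path)).to(device)
        batch = preprocess(img).unsqueeze(0)
        with torch.no_grad():
            probs = model(batch).squeeze(0).softmax(0)

        top = torch.topk(probs, 5)
        # Map class indices back to human-readable category names.
        return [(weights.meta["categories"][int(i)], float(p)) for p, i in zip(top.values, top.indices)]

    # classify_top5("gallery/assets/dog2.jpg") should rank "German shepherd" first.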
+ print(msg) + else: + raise ValueError(msg) + + smoke_test_torchvision() + smoke_test_torchvision_read_decode() + smoke_test_torchvision_resnet50_classify() + smoke_test_torchvision_decode_jpeg() + if torch.cuda.is_available(): + smoke_test_torchvision_decode_jpeg("cuda") + smoke_test_torchvision_resnet50_classify("cuda") + + # torch.compile is not supported on Python 3.14+ and Python built with GIL disabled + if sys.version_info < (3, 14, 0) and not sysconfig.get_config_var("Py_GIL_DISABLED"): + smoke_test_compile() + + if torch.backends.mps.is_available(): + smoke_test_torchvision_resnet50_classify("mps") + + +if __name__ == "__main__": + main() diff --git a/test/test_architecture_ops.py b/test/test_architecture_ops.py new file mode 100644 index 00000000000..32ad1a32f89 --- /dev/null +++ b/test/test_architecture_ops.py @@ -0,0 +1,46 @@ +import unittest + +import pytest +import torch + +from torchvision.models.maxvit import SwapAxes, WindowDepartition, WindowPartition + + +class MaxvitTester(unittest.TestCase): + def test_maxvit_window_partition(self): + input_shape = (1, 3, 224, 224) + partition_size = 7 + n_partitions = input_shape[3] // partition_size + + x = torch.randn(input_shape) + + partition = WindowPartition() + departition = WindowDepartition() + + x_hat = partition(x, partition_size) + x_hat = departition(x_hat, partition_size, n_partitions, n_partitions) + + torch.testing.assert_close(x, x_hat) + + def test_maxvit_grid_partition(self): + input_shape = (1, 3, 224, 224) + partition_size = 7 + n_partitions = input_shape[3] // partition_size + + x = torch.randn(input_shape) + pre_swap = SwapAxes(-2, -3) + post_swap = SwapAxes(-2, -3) + + partition = WindowPartition() + departition = WindowDepartition() + + x_hat = partition(x, n_partitions) + x_hat = pre_swap(x_hat) + x_hat = post_swap(x_hat) + x_hat = departition(x_hat, n_partitions, partition_size, partition_size) + + torch.testing.assert_close(x, x_hat) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/test_backbone_utils.py b/test/test_backbone_utils.py index 4fba3c3d098..8f6fc5d9278 100644 --- a/test/test_backbone_utils.py +++ b/test/test_backbone_utils.py @@ -1,6 +1,7 @@ import random +from collections.abc import Mapping, Sequence +from copy import deepcopy from itertools import chain -from typing import Mapping, Sequence import pytest import torch @@ -194,7 +195,7 @@ def test_feature_extraction_methods_equivalence(self): assert n1 == n2 assert p1.equal(p2) - # And that ouputs match + # And that outputs match with torch.no_grad(): ilg_out = ilg_model(self.inp) fgn_out = fx_model(self.inp) @@ -322,3 +323,14 @@ def forward(self, x): out = model(self.inp) # And backward out["leaf_module"].float().mean().backward() + + def test_deepcopy(self): + # Non-regression test for https://github.com/pytorch/vision/issues/8634 + model = models.efficientnet_b3(weights=None) + extractor = create_feature_extractor(model=model, return_nodes={"classifier.0": "out"}) + + extractor.eval() + extractor.train() + extractor = deepcopy(extractor) + extractor.eval() + extractor.train() diff --git a/test/test_cpp_models.py b/test/test_cpp_models.py deleted file mode 100644 index d8d0836d499..00000000000 --- a/test/test_cpp_models.py +++ /dev/null @@ -1,154 +0,0 @@ -import os -import sys -import unittest - -import torch -import torchvision.transforms.functional as F -from PIL import Image -from torchvision import models - -try: - from torchvision import _C_tests -except ImportError: - _C_tests = None - - -def 
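The new `test_deepcopy` above is a non-regression test for pytorch/vision#8634: a feature extractor must survive `copy.deepcopy` and subsequent `eval()`/`train()` switches. A standalone sketch of that scenario (resnet18 and its "layer4" node are used here purely for illustration; the test itself uses efficientnet_b3 and "classifier.0"):

    from copy import deepcopy

    import torch
    from torchvision.models import resnet18
    from torchvision.models.feature_extraction import create_feature_extractor

    model = resnet18(weights=None)
    extractor = create_feature_extractor(model, return_nodes={"layer4": "feat"})

    extractor = deepcopy(extractor)  # must not raise
    extractor.eval()
    extractor.train()

    out = extractor(torch.randn(1, 3, 224, 224))
    print(out["feat"].shape)  # torch.Size([1, 512, 7, 7])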
process_model(model, tensor, func, name): - model.eval() - traced_script_module = torch.jit.trace(model, tensor) - traced_script_module.save("model.pt") - - py_output = model.forward(tensor) - cpp_output = func("model.pt", tensor) - - assert torch.allclose(py_output, cpp_output), "Output mismatch of " + name + " models" - - -def read_image1(): - image_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" - ) - image = Image.open(image_path) - image = image.resize((224, 224)) - x = F.pil_to_tensor(image) - x = F.convert_image_dtype(x) - return x.view(1, 3, 224, 224) - - -def read_image2(): - image_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" - ) - image = Image.open(image_path) - image = image.resize((299, 299)) - x = F.pil_to_tensor(image) - x = F.convert_image_dtype(x) - x = x.view(1, 3, 299, 299) - return torch.cat([x, x], 0) - - -@unittest.skipIf( - sys.platform == "darwin" or True, - "C++ models are broken on OS X at the moment, and there's a BC breakage on main; " - "see https://github.com/pytorch/vision/issues/1191", -) -class Tester(unittest.TestCase): - image = read_image1() - - def test_alexnet(self): - process_model(models.alexnet(), self.image, _C_tests.forward_alexnet, "Alexnet") - - def test_vgg11(self): - process_model(models.vgg11(), self.image, _C_tests.forward_vgg11, "VGG11") - - def test_vgg13(self): - process_model(models.vgg13(), self.image, _C_tests.forward_vgg13, "VGG13") - - def test_vgg16(self): - process_model(models.vgg16(), self.image, _C_tests.forward_vgg16, "VGG16") - - def test_vgg19(self): - process_model(models.vgg19(), self.image, _C_tests.forward_vgg19, "VGG19") - - def test_vgg11_bn(self): - process_model(models.vgg11_bn(), self.image, _C_tests.forward_vgg11bn, "VGG11BN") - - def test_vgg13_bn(self): - process_model(models.vgg13_bn(), self.image, _C_tests.forward_vgg13bn, "VGG13BN") - - def test_vgg16_bn(self): - process_model(models.vgg16_bn(), self.image, _C_tests.forward_vgg16bn, "VGG16BN") - - def test_vgg19_bn(self): - process_model(models.vgg19_bn(), self.image, _C_tests.forward_vgg19bn, "VGG19BN") - - def test_resnet18(self): - process_model(models.resnet18(), self.image, _C_tests.forward_resnet18, "Resnet18") - - def test_resnet34(self): - process_model(models.resnet34(), self.image, _C_tests.forward_resnet34, "Resnet34") - - def test_resnet50(self): - process_model(models.resnet50(), self.image, _C_tests.forward_resnet50, "Resnet50") - - def test_resnet101(self): - process_model(models.resnet101(), self.image, _C_tests.forward_resnet101, "Resnet101") - - def test_resnet152(self): - process_model(models.resnet152(), self.image, _C_tests.forward_resnet152, "Resnet152") - - def test_resnext50_32x4d(self): - process_model(models.resnext50_32x4d(), self.image, _C_tests.forward_resnext50_32x4d, "ResNext50_32x4d") - - def test_resnext101_32x8d(self): - process_model(models.resnext101_32x8d(), self.image, _C_tests.forward_resnext101_32x8d, "ResNext101_32x8d") - - def test_wide_resnet50_2(self): - process_model(models.wide_resnet50_2(), self.image, _C_tests.forward_wide_resnet50_2, "WideResNet50_2") - - def test_wide_resnet101_2(self): - process_model(models.wide_resnet101_2(), self.image, _C_tests.forward_wide_resnet101_2, "WideResNet101_2") - - def test_squeezenet1_0(self): - process_model(models.squeezenet1_0(), self.image, _C_tests.forward_squeezenet1_0, "Squeezenet1.0") - - def test_squeezenet1_1(self): - 
process_model(models.squeezenet1_1(), self.image, _C_tests.forward_squeezenet1_1, "Squeezenet1.1") - - def test_densenet121(self): - process_model(models.densenet121(), self.image, _C_tests.forward_densenet121, "Densenet121") - - def test_densenet169(self): - process_model(models.densenet169(), self.image, _C_tests.forward_densenet169, "Densenet169") - - def test_densenet201(self): - process_model(models.densenet201(), self.image, _C_tests.forward_densenet201, "Densenet201") - - def test_densenet161(self): - process_model(models.densenet161(), self.image, _C_tests.forward_densenet161, "Densenet161") - - def test_mobilenet_v2(self): - process_model(models.mobilenet_v2(), self.image, _C_tests.forward_mobilenetv2, "MobileNet") - - def test_googlenet(self): - process_model(models.googlenet(), self.image, _C_tests.forward_googlenet, "GoogLeNet") - - def test_mnasnet0_5(self): - process_model(models.mnasnet0_5(), self.image, _C_tests.forward_mnasnet0_5, "MNASNet0_5") - - def test_mnasnet0_75(self): - process_model(models.mnasnet0_75(), self.image, _C_tests.forward_mnasnet0_75, "MNASNet0_75") - - def test_mnasnet1_0(self): - process_model(models.mnasnet1_0(), self.image, _C_tests.forward_mnasnet1_0, "MNASNet1_0") - - def test_mnasnet1_3(self): - process_model(models.mnasnet1_3(), self.image, _C_tests.forward_mnasnet1_3, "MNASNet1_3") - - def test_inception_v3(self): - self.image = read_image2() - process_model(models.inception_v3(), self.image, _C_tests.forward_inceptionv3, "Inceptionv3") - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_datasets.py b/test/test_datasets.py index ad31856cd01..22c14cbc08d 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -8,12 +8,13 @@ import pathlib import pickle import random +import re import shutil import string import unittest import xml.etree.ElementTree as ET import zipfile -from typing import Callable, Tuple, Union +from typing import Callable, Union import datasets_utils import numpy as np @@ -21,12 +22,15 @@ import pytest import torch import torch.nn.functional as F +from common_utils import combinations_grid from torchvision import datasets +from torchvision.io import decode_image +from torchvision.transforms import v2 class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) @staticmethod def _make_binary_file(num_elements, root, name): @@ -112,9 +116,7 @@ class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech101 FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple)) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - target_type=("category", "annotation", ["category", "annotation"]) - ) + ADDITIONAL_CONFIGS = combinations_grid(target_type=("category", "annotation", ["category", "annotation"])) REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir, config): @@ -183,6 +185,11 @@ def test_combined_targets(self): ), "Type of the combined target does not match the type of the corresponding individual target: " f"{actual} is not {expected}", + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(target_type="category", transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class 
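Most of the changes below swap `datasets_utils.combinations_grid` for the `combinations_grid` imported from `common_utils` at the top of the file. For context, a behavioural sketch of what such a helper produces (this is the idea, not necessarily the exact implementation in `test/common_utils.py`): every keyword maps to an iterable of options, and the result is one config dict per element of the cartesian product.

    import itertools

    def combinations_grid(**kwargs):
        # One dict per combination of the supplied options.
        return [dict(zip(kwargs, values)) for values in itertools.product(*kwargs.values())]

    combinations_grid(split=("train", "test"), fold=(1, 2))
    # [{'split': 'train', 'fold': 1}, {'split': 'train', 'fold': 2},
    #  {'split': 'test', 'fold': 1}, {'split': 'test', 'fold': 2}]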
Caltech256TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech256 @@ -190,7 +197,7 @@ class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" - categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) + categories = ((1, "ak47"), (2, "american-flag"), (3, "backpack")) num_images_per_category = 2 for idx, category in categories: @@ -207,7 +214,7 @@ def inject_fake_data(self, tmpdir, config): class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.WIDERFace FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) def inject_fake_data(self, tmpdir, config): widerface_dir = pathlib.Path(tmpdir) / "widerface" @@ -258,6 +265,11 @@ def inject_fake_data(self, tmpdir, config): return split_to_num_examples[config["split"]] + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Cityscapes @@ -268,8 +280,8 @@ class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): "color", ) ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), - *datasets_utils.combinations_grid( + *combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), + *combinations_grid( mode=("coarse",), split=("train", "train_extra", "val"), target_type=TARGET_TYPES, @@ -382,11 +394,19 @@ def test_feature_types_target_polygon(self): assert isinstance(polygon_img, PIL.Image.Image) (polygon_target, info["expected_polygon_target"]) + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + for target_type in ["instance", "semantic", ["instance", "semantic"]]: + with self.create_dataset(target_type=target_type, transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class ImageNetTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ImageNet REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val")) + + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) @@ -413,10 +433,15 @@ def inject_fake_data(self, tmpdir, config): torch.save((wnid_to_classes, None), tmpdir / "meta.bin") return num_examples + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CIFAR10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) _VERSION_CONFIG = dict( base_folder="cifar-10-batches-py", @@ -489,7 +514,7 @@ class 
CelebATestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CelebA FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "valid", "test", "all"), target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]), ) @@ -510,7 +535,8 @@ def inject_fake_data(self, tmpdir, config): self._create_bbox_txt(base_folder, num_images) self._create_landmarks_txt(base_folder, num_images) - return dict(num_examples=num_images_per_split[config["split"]], attr_names=attr_names) + num_samples = num_images_per_split.get(config["split"], 0) if isinstance(config["split"], str) else 0 + return dict(num_examples=num_samples, attr_names=attr_names) def _create_split_txt(self, root): num_images_per_split = dict(train=4, valid=3, test=2) @@ -607,25 +633,46 @@ def test_images_names_split(self): assert merged_imgs_names == all_imgs_names + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + for target_type in ["identity", "bbox", ["identity", "bbox"]]: + with self.create_dataset(target_type=target_type, transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + + def test_invalid_split_list(self): + with pytest.raises(ValueError, match="Expected type str for argument split, but got type ."): + with self.create_dataset(split=[1]): + pass + + def test_invalid_split_int(self): + with pytest.raises(ValueError, match="Expected type str for argument split, but got type ."): + with self.create_dataset(split=1): + pass + + def test_invalid_split_value(self): + with pytest.raises( + ValueError, + match="Unknown value '{value}' for argument {arg}. 
Valid values are {{{valid_values}}}.".format( + value="invalid", + arg="split", + valid_values=("train", "valid", "test", "all"), + ), + ): + with self.create_dataset(split="invalid"): + pass + class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.VOCSegmentation FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image) ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid( - year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval") - ), + *combinations_grid(year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval")), dict(year="2007", image_set="test"), - dict(year="2007-test", image_set="test"), ) def inject_fake_data(self, tmpdir, config): - year, is_test_set = ( - ("2007", True) - if config["year"] == "2007-test" or config["image_set"] == "test" - else (config["year"], False) - ) + year, is_test_set = config["year"], config["image_set"] == "test" image_set = config["image_set"] base_dir = pathlib.Path(tmpdir) @@ -701,6 +748,11 @@ def add_bndbox(obj, bndbox=None): return data + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class VOCDetectionTestCase(VOCSegmentationTestCase): DATASET_CLASS = datasets.VOCDetection @@ -721,6 +773,11 @@ def test_annotations(self): assert object == info["annotation"] + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CocoDetection @@ -751,28 +808,52 @@ def inject_fake_data(self, tmpdir, config): annotation_folder = tmpdir / self._ANNOTATIONS_FOLDER os.makedirs(annotation_folder) + + segmentation_kind = config.pop("segmentation_kind", "list") info = self._create_annotation_file( - annotation_folder, self._ANNOTATIONS_FILE, file_names, num_annotations_per_image + annotation_folder, + self._ANNOTATIONS_FILE, + file_names, + num_annotations_per_image, + segmentation_kind=segmentation_kind, ) info["num_examples"] = num_images return info - def _create_annotation_file(self, root, name, file_names, num_annotations_per_image): + def _create_annotation_file(self, root, name, file_names, num_annotations_per_image, segmentation_kind="list"): image_ids = [int(file_name.stem) for file_name in file_names] images = [dict(file_name=str(file_name), id=id) for file_name, id in zip(file_names, image_ids)] - annotations, info = self._create_annotations(image_ids, num_annotations_per_image) + annotations, info = self._create_annotations(image_ids, num_annotations_per_image, segmentation_kind) self._create_json(root, name, dict(images=images, annotations=annotations)) return info - def _create_annotations(self, image_ids, num_annotations_per_image): - annotations = datasets_utils.combinations_grid( - image_id=image_ids, bbox=([1.0, 2.0, 3.0, 4.0],) * num_annotations_per_image - ) - for id, annotation in enumerate(annotations): - annotation["id"] = id + def _create_annotations(self, image_ids, num_annotations_per_image, segmentation_kind="list"): + annotations = [] + annotion_id = 0 + + for image_id in itertools.islice(itertools.cycle(image_ids), len(image_ids) * num_annotations_per_image): + 
segmentation = { + "list": [torch.rand(8).tolist()], + "rle": {"size": [10, 10], "counts": [1]}, + "rle_encoded": {"size": [2400, 2400], "counts": "PQRQ2[1\\Y2f0gNVNRhMg2"}, + "bad": 123, + }[segmentation_kind] + + annotations.append( + dict( + image_id=image_id, + id=annotion_id, + bbox=torch.rand(4).tolist(), + segmentation=segmentation, + category_id=int(torch.randint(91, ())), + area=float(torch.rand(1)), + iscrowd=int(torch.randint(2, size=(1,))), + ) + ) + annotion_id += 1 return annotations, dict() def _create_json(self, root, name, content): @@ -781,13 +862,39 @@ def _create_json(self, root, name, content): json.dump(content, fh) return file + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + + def test_slice_error(self): + with self.create_dataset() as (dataset, _): + with pytest.raises(ValueError, match="Index must be of type integer"): + dataset[:2] + + def test_segmentation_kind(self): + if isinstance(self, CocoCaptionsTestCase): + return + + for segmentation_kind in ("list", "rle", "rle_encoded"): + config = {"segmentation_kind": segmentation_kind} + with self.create_dataset(config) as (dataset, _): + dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys="all") + list(dataset) + + config = {"segmentation_kind": "bad"} + with self.create_dataset(config) as (dataset, _): + dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys="all") + with pytest.raises(ValueError, match="COCO segmentation expected to be a dict or a list"): + list(dataset) + class CocoCaptionsTestCase(CocoDetectionTestCase): DATASET_CLASS = datasets.CocoCaptions - def _create_annotations(self, image_ids, num_annotations_per_image): + def _create_annotations(self, image_ids, num_annotations_per_image, segmentation_kind="list"): captions = [str(idx) for idx in range(num_annotations_per_image)] - annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions) + annotations = combinations_grid(image_id=image_ids, caption=captions) for id, annotation in enumerate(annotations): annotation["id"] = id return annotations, dict(captions=captions) @@ -797,11 +904,16 @@ def test_captions(self): _, captions = dataset[0] assert tuple(captions) == tuple(info["captions"]) + def test_transforms_v2_wrapper_spawn(self): + # We need to define this method, because otherwise the test from the super class will + # be run + pytest.skip("CocoCaptions is currently not supported by the v2 wrapper.") + class UCF101TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.UCF101 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False)) _VIDEO_FOLDER = "videos" _ANNOTATIONS_FOLDER = "annotations" @@ -862,9 +974,7 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.LSUN REQUIRED_PACKAGES = ("lmdb",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]) - ) + ADDITIONAL_CONFIGS = combinations_grid(classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"])) _CATEGORIES = ( "bedroom", @@ -949,7 +1059,7 @@ def test_not_found_or_corrupted(self): class KineticsTestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.Kinetics - 
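The `segmentation_kind` options above cover the shapes a COCO `segmentation` field can take: a list of polygons, an uncompressed RLE dict, and a compressed (string-encoded) RLE dict. Illustrative minimal annotation records matching what the fake-data generator writes (all values are arbitrary):

    import json

    polygon_ann = {
        "image_id": 0,
        "id": 0,
        "bbox": [1.0, 2.0, 3.0, 4.0],
        "segmentation": [[1.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0]],  # one 4-point polygon
        "category_id": 1,
        "area": 1.0,
        "iscrowd": 0,
    }

    # Uncompressed RLE: a dict with "size" and a list of run lengths in "counts".
    rle_ann = dict(polygon_ann, id=1, iscrowd=1, segmentation={"size": [10, 10], "counts": [1]})

    print(json.dumps([polygon_ann, rle_ann], indent=2))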
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) def inject_fake_data(self, tmpdir, config): classes = ("Abseiling", "Zumba") @@ -965,30 +1075,17 @@ def inject_fake_data(self, tmpdir, config): ) return num_videos_per_class * len(classes) - -class Kinetics400TestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.Kinetics400 - - def inject_fake_data(self, tmpdir, config): - classes = ("Abseiling", "Zumba") - num_videos_per_class = 2 - - digits = string.ascii_letters + string.digits + "-_" - for cls in classes: - datasets_utils.create_video_folder( - tmpdir, - cls, - lambda _: f"{datasets_utils.create_random_string(11, digits)}.avi", - num_videos_per_class, - ) - - return num_videos_per_class * len(classes) + @pytest.mark.xfail(reason="FIXME") + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(output_format="TCHW", transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.HMDB51 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False)) _VIDEO_FOLDER = "videos" _SPLITS_FOLDER = "splits" @@ -1048,7 +1145,8 @@ def _create_split_files(self, root, video_files, fold, train): class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Omniglot - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(background=(True, False)) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): target_folder = ( @@ -1079,6 +1177,8 @@ class SBUTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SBU FEATURE_TYPES = (PIL.Image.Image, str) + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): num_images = 3 @@ -1128,7 +1228,7 @@ def inject_fake_data(self, tmpdir, config): class USPSTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.USPS - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) def inject_fake_data(self, tmpdir, config): num_images = 2 if config["train"] else 1 @@ -1150,7 +1250,7 @@ class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse") - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation") ) @@ -1211,6 +1311,11 @@ def _create_segmentation(self, size): def _file_stem(self, idx): return f"2008_{idx:06d}" + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(mode="segmentation", transforms=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class FakeDataTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FakeData @@ -1236,7 +1341,7 @@ class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase): _TRAIN_FEATURE_TYPES = (torch.Tensor,) _TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor) - 
datasets_utils.combinations_grid(train=(True, False)) + combinations_grid(train=(True, False)) _NAME = "liberty" @@ -1312,6 +1417,8 @@ class Flickr8kTestCase(datasets_utils.ImageDatasetTestCase): _IMAGES_FOLDER = "images" _ANNOTATIONS_FILE = "captions.html" + SUPPORT_TV_IMAGE_DECODE = True + def dataset_args(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) root = tmpdir / self._IMAGES_FOLDER @@ -1381,6 +1488,8 @@ class Flickr30kTestCase(Flickr8kTestCase): _ANNOTATIONS_FILE = "captions.token" + SUPPORT_TV_IMAGE_DECODE = True + def _image_file_name(self, idx): return f"{idx}.jpg" @@ -1395,7 +1504,7 @@ def _create_annotations_file(self, root, name, images, num_captions_per_image): class MNISTTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.MNIST - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) _MAGIC_DTYPES = { torch.uint8: 8, @@ -1465,7 +1574,7 @@ class EMNISTTestCase(MNISTTestCase): DATASET_CLASS = datasets.EMNIST DEFAULT_CONFIG = dict(split="byclass") - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("byclass", "bymerge", "balanced", "letters", "digits", "mnist"), train=(True, False) ) @@ -1476,7 +1585,7 @@ def _prefix(self, config): class QMNISTTestCase(MNISTTestCase): DATASET_CLASS = datasets.QMNIST - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(what=("train", "test", "test10k", "nist")) + ADDITIONAL_CONFIGS = combinations_grid(what=("train", "test", "test10k", "nist")) _LABELS_SIZE = (8,) _LABELS_DTYPE = torch.int32 @@ -1518,30 +1627,51 @@ def test_num_examples_test50k(self): assert len(dataset) == info["num_examples"] - 10000 +class MovingMNISTTestCase(datasets_utils.DatasetTestCase): + DATASET_CLASS = datasets.MovingMNIST + FEATURE_TYPES = (torch.Tensor,) + + ADDITIONAL_CONFIGS = combinations_grid(split=(None, "train", "test"), split_ratio=(10, 1, 19)) + + _NUM_FRAMES = 20 + + def inject_fake_data(self, tmpdir, config): + base_folder = os.path.join(tmpdir, self.DATASET_CLASS.__name__) + os.makedirs(base_folder, exist_ok=True) + num_samples = 5 + data = np.concatenate( + [ + np.zeros((config["split_ratio"], num_samples, 64, 64)), + np.ones((self._NUM_FRAMES - config["split_ratio"], num_samples, 64, 64)), + ] + ) + np.save(os.path.join(base_folder, "mnist_test_seq.npy"), data) + return num_samples + + @datasets_utils.test_all_configs + def test_split(self, config): + with self.create_dataset(config) as (dataset, _): + if config["split"] == "train": + assert (dataset.data == 0).all() + elif config["split"] == "test": + assert (dataset.data == 1).all() + else: + assert dataset.data.size()[1] == self._NUM_FRAMES + + class DatasetFolderTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.DatasetFolder - # The dataset has no fixed return type since it is defined by the loader parameter. For testing, we use a loader - # that simply returns the path as type 'str' instead of loading anything. See the 'dataset_args()' method. - FEATURE_TYPES = (str, int) - - _IMAGE_EXTENSIONS = ("jpg", "png") - _VIDEO_EXTENSIONS = ("avi", "mp4") - _EXTENSIONS = (*_IMAGE_EXTENSIONS, *_VIDEO_EXTENSIONS) + _EXTENSIONS = ("jpg", "png") # DatasetFolder has two mutually exclusive parameters: 'extensions' and 'is_valid_file'. One of both is required. # We only iterate over different 'extensions' here and handle the tests for 'is_valid_file' in the # 'test_is_valid_file()' method. 
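The MovingMNIST fake data above is built so that `test_split` can tell the splits apart by value alone: the first `split_ratio` frames are all zeros and the remaining frames are all ones. A short sketch of that layout and the slicing it implies (the slicing below mirrors the fake-data construction, not necessarily the dataset's internal implementation):

    import numpy as np

    NUM_FRAMES, split_ratio, num_samples = 20, 10, 5
    data = np.concatenate(
        [
            np.zeros((split_ratio, num_samples, 64, 64)),               # "train" frames
            np.ones((NUM_FRAMES - split_ratio, num_samples, 64, 64)),   # "test" frames
        ]
    )

    train_part, test_part = data[:split_ratio], data[split_ratio:]
    assert (train_part == 0).all() and (test_part == 1).all()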
DEFAULT_CONFIG = dict(extensions=_EXTENSIONS) - ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(extensions=[(ext,) for ext in _IMAGE_EXTENSIONS]), - dict(extensions=_IMAGE_EXTENSIONS), - *datasets_utils.combinations_grid(extensions=[(ext,) for ext in _VIDEO_EXTENSIONS]), - dict(extensions=_VIDEO_EXTENSIONS), - ) + ADDITIONAL_CONFIGS = combinations_grid(extensions=[(ext,) for ext in _EXTENSIONS]) def dataset_args(self, tmpdir, config): - return tmpdir, lambda x: x + return tmpdir, datasets.folder.pil_loader def inject_fake_data(self, tmpdir, config): extensions = config["extensions"] or self._is_valid_file_to_extensions(config["is_valid_file"]) @@ -1552,18 +1682,16 @@ def inject_fake_data(self, tmpdir, config): if ext not in extensions: continue - create_example_folder = ( - datasets_utils.create_image_folder - if ext in self._IMAGE_EXTENSIONS - else datasets_utils.create_video_folder - ) - num_examples = torch.randint(1, 3, size=()).item() - create_example_folder(tmpdir, cls, lambda idx: self._file_name_fn(cls, ext, idx), num_examples) + datasets_utils.create_image_folder(tmpdir, cls, lambda idx: self._file_name_fn(cls, ext, idx), num_examples) num_examples_total += num_examples classes.append(cls) + if config.pop("make_empty_class", False): + os.makedirs(pathlib.Path(tmpdir) / "empty_class") + classes.append("empty_class") + return dict(num_examples=num_examples_total, classes=classes) def _file_name_fn(self, cls, ext, idx): @@ -1588,6 +1716,23 @@ def test_classes(self, config): assert len(dataset.classes) == len(info["classes"]) assert all([a == b for a, b in zip(dataset.classes, info["classes"])]) + def test_allow_empty(self): + config = { + "extensions": self._EXTENSIONS, + "make_empty_class": True, + } + + config["allow_empty"] = True + with self.create_dataset(config) as (dataset, info): + assert "empty_class" in dataset.classes + assert len(dataset.classes) == len(info["classes"]) + assert all([a == b for a, b in zip(dataset.classes, info["classes"])]) + + config["allow_empty"] = False + with pytest.raises(FileNotFoundError, match="Found no valid file"): + with self.create_dataset(config) as (dataset, info): + pass + class ImageFolderTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ImageFolder @@ -1613,7 +1758,7 @@ def test_classes(self, config): class KittiTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti FEATURE_TYPES = (PIL.Image.Image, (list, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) def inject_fake_data(self, tmpdir, config): kitti_dir = os.path.join(tmpdir, "Kitti", "raw") @@ -1645,11 +1790,16 @@ def inject_fake_data(self, tmpdir, config): return split_to_num_examples[config["train"]] + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class SvhnTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SVHN REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "extra")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "extra")) def inject_fake_data(self, tmpdir, config): import scipy.io as sio @@ -1670,7 +1820,7 @@ def inject_fake_data(self, tmpdir, config): class 
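`test_allow_empty` above checks the new `allow_empty` flag: with it, classes whose folders contain no valid files are kept instead of triggering `FileNotFoundError`. A minimal, self-contained illustration using a throwaway loader and text files so no image assets are needed (directory and file names are arbitrary):

    import pathlib
    import tempfile

    from torchvision import datasets

    root = pathlib.Path(tempfile.mkdtemp())
    (root / "cats").mkdir()
    (root / "cats" / "0.txt").write_text("fake sample")
    (root / "empty_class").mkdir()  # no valid files in here

    dataset = datasets.DatasetFolder(
        root, loader=lambda path: path, extensions=("txt",), allow_empty=True
    )
    print(dataset.classes)  # ['cats', 'empty_class'] -- the empty class is kept

    # With allow_empty=False (the default), the same layout raises FileNotFoundError.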
Places365TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Places365 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train-standard", "train-challenge", "val"), small=(False, True), ) @@ -1752,20 +1902,16 @@ def test_class_to_idx(self): with self.create_dataset() as (dataset, _): assert dataset.class_to_idx == class_to_idx - def test_images_download_preexisting(self): - with pytest.raises(RuntimeError): - with self.create_dataset({"download": True}): - pass - class INaturalistTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.INaturalist FEATURE_TYPES = (PIL.Image.Image, (int, tuple)) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( target_type=("kingdom", "full", "genus", ["kingdom", "phylum", "class", "order", "family", "genus", "full"]), version=("2021_train",), ) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): categories = [ @@ -1799,12 +1945,14 @@ def test_targets(self): class LFWPeopleTestCase(datasets_utils.DatasetTestCase): DATASET_CLASS = datasets.LFWPeople FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("10fold", "train", "test"), image_set=("original", "funneled", "deepfunneled") ) _IMAGES_DIR = {"original": "lfw", "funneled": "lfw_funneled", "deepfunneled": "lfw-deepfunneled"} _file_id = {"10fold": "", "train": "DevTrain", "test": "DevTest"} + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) / "lfw-py" os.makedirs(tmpdir, exist_ok=True) @@ -1841,6 +1989,18 @@ def _create_random_id(self): part2 = datasets_utils.create_random_string(random.randint(4, 7)) return f"{part1}_{part2}" + def test_tv_decode_image_support(self): + if not self.SUPPORT_TV_IMAGE_DECODE: + pytest.skip(f"{self.DATASET_CLASS.__name__} does not support torchvision.io.decode_image.") + + with self.create_dataset( + config=dict( + loader=decode_image, + ) + ) as (dataset, _): + image = dataset[0][0] + assert isinstance(image, torch.Tensor) + class LFWPairsTestCase(LFWPeopleTestCase): DATASET_CLASS = datasets.LFWPairs @@ -1875,11 +2035,13 @@ def _inject_pairs(self, root, num_pairs, same): class SintelTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Sintel - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "Sintel" @@ -1943,9 +2105,11 @@ def test_bad_input(self): class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.KittiFlow - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "KittiFlow" @@ -2003,7 +2167,7 @@ def test_bad_input(self): class FlyingChairsTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FlyingChairs - ADDITIONAL_CONFIGS 
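Several test cases below gain `SUPPORT_TV_IMAGE_DECODE = True`, which feeds `torchvision.io.decode_image` in as the dataset's `loader` so samples come back as uint8 tensors rather than PIL images. A small sketch of that usage with `ImageFolder` (assuming a torchvision build where `decode_image` accepts a file path, as the smoke test above does; the temporary folder and file names are illustrative):

    import pathlib
    import tempfile

    import numpy as np
    import PIL.Image
    import torch
    from torchvision import datasets
    from torchvision.io import decode_image

    root = pathlib.Path(tempfile.mkdtemp())
    (root / "class_a").mkdir()
    PIL.Image.fromarray(np.zeros((8, 8, 3), dtype=np.uint8)).save(root / "class_a" / "0.png")

    dataset = datasets.ImageFolder(root, loader=decode_image)
    image, label = dataset[0]
    assert isinstance(image, torch.Tensor) and image.dtype == torch.uint8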
= datasets_utils.combinations_grid(split=("train", "val")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 @@ -2058,13 +2222,15 @@ def test_flow(self, config): class FlyingThings3DTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FlyingThings3D - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "test"), pass_name=("clean", "final", "both"), camera=("left", "right", "both") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "FlyingThings3D" @@ -2131,6 +2297,8 @@ def test_bad_input(self): class HD1KTestCase(KittiFlowTestCase): DATASET_CLASS = datasets.HD1K + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "hd1k" @@ -2173,6 +2341,7 @@ def inject_fake_data(self, tmpdir, config): class EuroSATTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.EuroSAT FEATURE_TYPES = (PIL.Image.Image, int) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): data_folder = os.path.join(tmpdir, "eurosat", "2750") @@ -2195,7 +2364,9 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Food101 FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) + + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir: str, config): root_folder = pathlib.Path(tmpdir) / "food-101" @@ -2230,9 +2401,10 @@ def inject_fake_data(self, tmpdir: str, config): class FGVCAircraftTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FGVCAircraft - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "val", "trainval", "test"), annotation_level=("variant", "family", "manufacturer") ) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir: str, config): split = config["split"] @@ -2282,6 +2454,8 @@ def inject_fake_data(self, tmpdir: str, config): class SUN397TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SUN397 + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir: str, config): data_dir = pathlib.Path(tmpdir) / "SUN397" data_dir.mkdir() @@ -2313,7 +2487,9 @@ class DTDTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.DTD FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + SUPPORT_TV_IMAGE_DECODE = True + + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "test", "val"), # There is no need to test the whole matrix here, since each fold is treated exactly the same partition=(1, 5, 10), @@ -2347,7 +2523,7 @@ def inject_fake_data(self, tmpdir: str, config): class FER2013TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FER2013 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) @@ -2355,34 +2531,74 @@ def inject_fake_data(self, tmpdir, config): base_folder = os.path.join(tmpdir, "fer2013") os.makedirs(base_folder) + use_icml = config.pop("use_icml", 
False) + use_fer = config.pop("use_fer", False) + num_samples = 5 - with open(os.path.join(base_folder, f"{config['split']}.csv"), "w", newline="") as file: - writer = csv.DictWriter( - file, - fieldnames=("emotion", "pixels") if config["split"] == "train" else ("pixels",), - quoting=csv.QUOTE_NONNUMERIC, - quotechar='"', - ) - writer.writeheader() - for _ in range(num_samples): - row = dict( - pixels=" ".join( - str(pixel) for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() - ) + + if use_icml or use_fer: + pixels_key, usage_key = (" pixels", " Usage") if use_icml else ("pixels", "Usage") + fieldnames = ("emotion", usage_key, pixels_key) if use_icml else ("emotion", pixels_key, usage_key) + filename = "icml_face_data.csv" if use_icml else "fer2013.csv" + with open(os.path.join(base_folder, filename), "w", newline="") as file: + writer = csv.DictWriter( + file, + fieldnames=fieldnames, + quoting=csv.QUOTE_NONNUMERIC, + quotechar='"', ) - if config["split"] == "train": - row["emotion"] = str(int(torch.randint(0, 7, ()))) + writer.writeheader() + for i in range(num_samples): + row = { + "emotion": str(int(torch.randint(0, 7, ()))), + usage_key: "Training" if i % 2 else "PublicTest", + pixels_key: " ".join( + str(pixel) + for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() + ), + } + + writer.writerow(row) + else: + with open(os.path.join(base_folder, f"{config['split']}.csv"), "w", newline="") as file: + writer = csv.DictWriter( + file, + fieldnames=("emotion", "pixels") if config["split"] == "train" else ("pixels",), + quoting=csv.QUOTE_NONNUMERIC, + quotechar='"', + ) + writer.writeheader() + for _ in range(num_samples): + row = dict( + pixels=" ".join( + str(pixel) + for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() + ) + ) + if config["split"] == "train": + row["emotion"] = str(int(torch.randint(0, 7, ()))) - writer.writerow(row) + writer.writerow(row) return num_samples + def test_icml_file(self): + config = {"split": "test"} + with self.create_dataset(config=config) as (dataset, _): + assert all(s[1] is None for s in dataset) + + for split in ("train", "test"): + for d in ({"use_icml": True}, {"use_fer": True}): + config = {"split": split, **d} + with self.create_dataset(config=config) as (dataset, _): + assert all(s[1] is not None for s in dataset) + class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.GTSRB FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir: str, config): root_folder = os.path.join(tmpdir, "gtsrb") @@ -2432,7 +2648,8 @@ class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CLEVRClassification FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0" @@ -2464,9 +2681,9 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.OxfordIIITPet FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = 
combinations_grid( split=("trainval", "test"), - target_types=("category", "segmentation", ["category", "segmentation"], []), + target_types=("category", "binary-category", "segmentation", ["category", "segmentation"], []), ) def inject_fake_data(self, tmpdir, config): @@ -2519,11 +2736,18 @@ def _meta_to_split_and_classification_ann(self, meta, idx): breed_id = "-1" return (image_id, class_id, species, breed_id) + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.StanfordCars REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) + + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): import scipy.io as io @@ -2567,7 +2791,9 @@ def inject_fake_data(self, tmpdir, config): class Country211TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Country211 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "valid", "test")) + + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir: str, config): split_folder = pathlib.Path(tmpdir) / "country211" / config["split"] @@ -2594,9 +2820,11 @@ def inject_fake_data(self, tmpdir: str, config): class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Flowers102 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("scipy",) + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir: str, config): base_folder = pathlib.Path(tmpdir) / "flowers-102" @@ -2630,7 +2858,7 @@ def inject_fake_data(self, tmpdir: str, config): class PCAMTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.PCAM - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("h5py",) def inject_fake_data(self, tmpdir: str, config): @@ -2652,9 +2880,11 @@ def inject_fake_data(self, tmpdir: str, config): class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.RenderedSST2 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"} + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir: str, config): root_folder = pathlib.Path(tmpdir) / "rendered-sst2" image_folder = root_folder / self.SPLIT_TO_FOLDER[config["split"]] @@ -2674,7 +2904,7 @@ def inject_fake_data(self, tmpdir: str, config): class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti2012Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -2736,7 +2966,7 @@ def 
test_bad_input(self): class Kitti2015StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti2015Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -2841,19 +3071,50 @@ def test_train_splits(self): datasets_utils.shape_test_for_stereo(left, right, disparity) +class CREStereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CREStereo + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, np.ndarray, type(None)) + + def inject_fake_data(self, tmpdir, config): + crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" + os.makedirs(crestereo_dir, exist_ok=True) + + examples = {"tree": 2, "shapenet": 3, "reflective": 6, "hole": 5} + + for category_name in ["shapenet", "reflective", "tree", "hole"]: + split_dir = crestereo_dir / category_name + os.makedirs(split_dir, exist_ok=True) + num_examples = examples[category_name] + + for idx in range(num_examples): + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) + # these are going to end up being gray scale images + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.png", size=(1, 100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.png", size=(1, 100, 100)) + + return sum(examples.values()) + + def test_splits(self): + with self.create_dataset() as (dataset, _): + for left, right, disparity, mask in dataset: + assert mask is None + datasets_utils.shape_test_for_stereo(left, right, disparity) + + class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FallingThingsStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(variant=("single", "mixed", "both")) + ADDITIONAL_CONFIGS = combinations_grid(variant=("single", "mixed", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod - def _make_dummy_depth_map(root: str, name: str, size: Tuple[int, int]): + def _make_dummy_depth_map(root: str, name: str, size: tuple[int, int]): file = pathlib.Path(root) / name image = np.ones((size[0], size[1]), dtype=np.uint8) PIL.Image.fromarray(image).save(file) @staticmethod - def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> None: + def _make_scene_folder(root: str, scene_name: str, size: tuple[int, int]) -> None: root = pathlib.Path(root) / scene_name os.makedirs(root, exist_ok=True) # jpg images @@ -2917,14 +3178,14 @@ def test_bad_input(self): class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SceneFlowStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( variant=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final", "both") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod def _create_pfm_folder( - root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int] + root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: tuple[int, int] ) -> None: root = pathlib.Path(root) / name os.makedirs(root, exist_ok=True) @@ -3004,10 +3265,10 @@ def test_bad_input(self): class 
InStereo2k(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.InStereo2k FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) @staticmethod - def _make_scene_folder(root: str, name: str, size: Tuple[int, int]): + def _make_scene_folder(root: str, name: str, size: tuple[int, int]): root = pathlib.Path(root) / name os.makedirs(root, exist_ok=True) @@ -3046,7 +3307,7 @@ def test_bad_input(self): class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SintelStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(pass_name=("final", "clean", "both")) + ADDITIONAL_CONFIGS = combinations_grid(pass_name=("final", "clean", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -3122,7 +3383,7 @@ def test_bad_input(self): class ETH3DStereoestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ETH3DStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod @@ -3187,5 +3448,177 @@ def test_bad_input(self): pass +class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Middlebury2014Stereo + ADDITIONAL_CONFIGS = combinations_grid( + split=("train", "additional"), + calibration=("perfect", "imperfect", "both"), + use_ambient_views=(True, False), + ) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> None: + calibrations = [None] if split == "test" else ["-perfect", "-imperfect"] + root_dir = pathlib.Path(root_dir) + + for c in calibrations: + scene_dir = root_dir / f"{scene_name}{c}" + os.makedirs(scene_dir, exist_ok=True) + # make normal images first + datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1E.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1L.png", size=(3, 100, 100)) + # these are going to end up being gray scale images + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=scene_dir / "disp0.pfm") + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=scene_dir / "disp1.pfm") + + def inject_fake_data(self, tmpdir, config): + split_scene_map = { + "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano"], + "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], + "test": ["Plants", "Classroom2E", "Classroom2", "Australia"], + } + + middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") + os.makedirs(middlebury_dir, exist_ok=True) + + split_dir = middlebury_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = {"train": 2, "additional": 3, "test": 4}.get(config["split"], 0) + for idx in range(num_examples): + scene_name = split_scene_map[config["split"]][idx] + self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) + + if config["calibration"] == "both": + 
num_examples *= 2 + return num_examples + + def test_train_splits(self): + for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): + with self.create_dataset(split=split, calibration=calibration) as (dataset, _): + for left, right, disparity, mask in dataset: + datasets_utils.shape_test_for_stereo(left, right, disparity, mask) + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split, calibration=None) as (dataset, _): + for left, right, disparity, mask in dataset: + datasets_utils.shape_test_for_stereo(left, right) + + def test_augmented_view_usage(self): + with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): + for left, right, disparity, mask in dataset: + datasets_utils.shape_test_for_stereo(left, right, disparity, mask) + + def test_value_err_train(self): + # train set invalid + split = "train" + calibration = None + with pytest.raises( + ValueError, + match=f"Split '{split}' has calibration settings, however None was provided as an argument." + f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", + ): + with self.create_dataset(split=split, calibration=calibration): + pass + + def test_value_err_test(self): + # test set invalid + split = "test" + calibration = "perfect" + with pytest.raises( + ValueError, match="Split 'test' has only no calibration settings, please set `calibration=None`." + ): + with self.create_dataset(split=split, calibration=calibration): + pass + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class ImagenetteTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Imagenette + ADDITIONAL_CONFIGS = combinations_grid(split=["train", "val"], size=["full", "320px", "160px"]) + + SUPPORT_TV_IMAGE_DECODE = True + + _WNIDS = [ + "n01440764", + "n02102040", + "n02979186", + "n03000684", + "n03028079", + "n03394916", + "n03417042", + "n03425413", + "n03445777", + "n03888257", + ] + + def inject_fake_data(self, tmpdir, config): + archive_root = "imagenette2" + if config["size"] != "full": + archive_root += f"-{config['size'].replace('px', '')}" + image_root = pathlib.Path(tmpdir) / archive_root / config["split"] + + num_images_per_class = 3 + for wnid in self._WNIDS: + datasets_utils.create_image_folder( + root=image_root, + name=wnid, + file_name_fn=lambda idx: f"{wnid}_{idx}.JPEG", + num_examples=num_images_per_class, + ) + + return num_images_per_class * len(self._WNIDS) + + +class TestDatasetWrapper: + def test_unknown_type(self): + unknown_object = object() + with pytest.raises( + TypeError, match=re.escape("is meant for subclasses of `torchvision.datasets.VisionDataset`") + ): + datasets.wrap_dataset_for_transforms_v2(unknown_object) + + def test_unknown_dataset(self): + class MyVisionDataset(datasets.VisionDataset): + pass + + dataset = MyVisionDataset("root") + + with pytest.raises(TypeError, match="No wrapper exist"): + datasets.wrap_dataset_for_transforms_v2(dataset) + + def test_missing_wrapper(self): + dataset = datasets.FakeData() + + with pytest.raises(TypeError, match="please open an issue"): + datasets.wrap_dataset_for_transforms_v2(dataset) + + def test_subclass(self, mocker): + from torchvision import tv_tensors + + sentinel = object() + mocker.patch.dict( + tv_tensors._dataset_wrapper.WRAPPER_FACTORIES, + clear=False, + 
values={datasets.FakeData: lambda dataset, target_keys: lambda idx, sample: sentinel}, + ) + + class MyFakeData(datasets.FakeData): + pass + + dataset = MyFakeData() + wrapped_dataset = datasets.wrap_dataset_for_transforms_v2(dataset) + + assert wrapped_dataset[0] is sentinel + + if __name__ == "__main__": unittest.main() diff --git a/test/test_datasets_download.py b/test/test_datasets_download.py index b4ae5a6d4b7..856a02b9d44 100644 --- a/test/test_datasets_download.py +++ b/test/test_datasets_download.py @@ -1,11 +1,12 @@ import contextlib import itertools +import shutil import tempfile import time +import traceback import unittest.mock import warnings from datetime import datetime -from distutils import dir_util from os import path from urllib.error import HTTPError, URLError from urllib.parse import urlparse @@ -13,13 +14,7 @@ import pytest from torchvision import datasets -from torchvision.datasets.utils import ( - _get_redirect_url, - check_integrity, - download_file_from_google_drive, - download_url, - USER_AGENT, -) +from torchvision.datasets.utils import _get_redirect_url, USER_AGENT def limit_requests_per_time(min_secs_between_requests=2.0): @@ -83,63 +78,65 @@ def inner_wrapper(request, *args, **kwargs): @contextlib.contextmanager def log_download_attempts( - urls_and_md5s=None, - file="utils", - patch=True, - mock_auxiliaries=None, + urls, + *, + dataset_module, ): - def add_mock(stack, name, file, **kwargs): + def maybe_add_mock(*, module, name, stack, lst=None): + patcher = unittest.mock.patch(f"torchvision.datasets.{module}.{name}") + try: - return stack.enter_context(unittest.mock.patch(f"torchvision.datasets.{file}.{name}", **kwargs)) - except AttributeError as error: - if file != "utils": - return add_mock(stack, name, "utils", **kwargs) - else: - raise pytest.UsageError from error - - if urls_and_md5s is None: - urls_and_md5s = set() - if mock_auxiliaries is None: - mock_auxiliaries = patch + mock = stack.enter_context(patcher) + except AttributeError: + return - with contextlib.ExitStack() as stack: - url_mock = add_mock(stack, "download_url", file, wraps=None if patch else download_url) - google_drive_mock = add_mock( - stack, "download_file_from_google_drive", file, wraps=None if patch else download_file_from_google_drive - ) + if lst is not None: + lst.append(mock) - if mock_auxiliaries: - add_mock(stack, "extract_archive", file) + with contextlib.ExitStack() as stack: + download_url_mocks = [] + download_file_from_google_drive_mocks = [] + for module in [dataset_module, "utils"]: + maybe_add_mock(module=module, name="download_url", stack=stack, lst=download_url_mocks) + maybe_add_mock( + module=module, + name="download_file_from_google_drive", + stack=stack, + lst=download_file_from_google_drive_mocks, + ) + maybe_add_mock(module=module, name="extract_archive", stack=stack) try: - yield urls_and_md5s + yield finally: - for args, kwargs in url_mock.call_args_list: - url = args[0] - md5 = args[-1] if len(args) == 4 else kwargs.get("md5") - urls_and_md5s.add((url, md5)) + for download_url_mock in download_url_mocks: + for args, kwargs in download_url_mock.call_args_list: + urls.append(args[0] if args else kwargs["url"]) - for args, kwargs in google_drive_mock.call_args_list: - id = args[0] - url = f"https://drive.google.com/file/d/{id}" - md5 = args[3] if len(args) == 4 else kwargs.get("md5") - urls_and_md5s.add((url, md5)) + for download_file_from_google_drive_mock in download_file_from_google_drive_mocks: + for args, kwargs in 
download_file_from_google_drive_mock.call_args_list: + file_id = args[0] if args else kwargs["file_id"] + urls.append(f"https://drive.google.com/file/d/{file_id}") def retry(fn, times=1, wait=5.0): - msgs = [] + tbs = [] for _ in range(times + 1): try: return fn() except AssertionError as error: - msgs.append(str(error)) + tbs.append("".join(traceback.format_exception(type(error), error, error.__traceback__))) time.sleep(wait) else: raise AssertionError( "\n".join( ( - f"Assertion failed {times + 1} times with {wait:.1f} seconds intermediate wait time.\n", - *(f"{idx}: {error}" for idx, error in enumerate(msgs, 1)), + "\n", + *[f"{'_' * 40} {idx:2d} {'_' * 40}\n\n{tb}" for idx, tb in enumerate(tbs, 1)], + ( + f"Assertion failed {times + 1} times with {wait:.1f} seconds intermediate wait time. " + f"You can find the the full tracebacks above." + ), ) ) ) @@ -149,10 +146,12 @@ def retry(fn, times=1, wait=5.0): def assert_server_response_ok(): try: yield - except URLError as error: - raise AssertionError("The request timed out.") from error except HTTPError as error: raise AssertionError(f"The server returned {error.code}: {error.reason}.") from error + except URLError as error: + raise AssertionError( + "Connection not possible due to SSL." if "SSL" in str(error) else "The request timed out." + ) from error except RecursionError as error: raise AssertionError(str(error)) from error @@ -163,45 +162,14 @@ def assert_url_is_accessible(url, timeout=5.0): urlopen(request, timeout=timeout) -def assert_file_downloads_correctly(url, md5, tmpdir, timeout=5.0): - file = path.join(tmpdir, path.basename(url)) - with assert_server_response_ok(): - with open(file, "wb") as fh: - request = Request(url, headers={"User-Agent": USER_AGENT}) - response = urlopen(request, timeout=timeout) - fh.write(response.read()) - - assert check_integrity(file, md5=md5), "The MD5 checksums mismatch" - - -class DownloadConfig: - def __init__(self, url, md5=None, id=None): - self.url = url - self.md5 = md5 - self.id = id or url +def collect_urls(dataset_cls, *args, **kwargs): + urls = [] + with contextlib.suppress(Exception), log_download_attempts( + urls, dataset_module=dataset_cls.__module__.split(".")[-1] + ): + dataset_cls(*args, **kwargs) - def __repr__(self) -> str: - return self.id - - -def make_download_configs(urls_and_md5s, name=None): - return [ - DownloadConfig(url, md5=md5, id=f"{name}, {url}" if name is not None else None) for url, md5 in urls_and_md5s - ] - - -def collect_download_configs(dataset_loader, name=None, **kwargs): - urls_and_md5s = set() - try: - with log_download_attempts(urls_and_md5s=urls_and_md5s, **kwargs): - dataset = dataset_loader() - except Exception: - dataset = None - - if name is None and dataset is not None: - name = type(dataset).__name__ - - return make_download_configs(urls_and_md5s, name) + return [(url, f"{dataset_cls.__name__}, {url}") for url in urls] # This is a workaround since fixtures, such as the built-in tmp_dir, can only be used within a test but not within a @@ -212,16 +180,18 @@ def collect_download_configs(dataset_loader, name=None, **kwargs): @pytest.fixture(scope="module", autouse=True) def root(): yield ROOT - dir_util.remove_tree(ROOT) + shutil.rmtree(ROOT) def places365(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Places365(ROOT, split=split, small=small, download=True), - name=f"Places365, {split}, {'small' if small else 'large'}", - file="places365", + return itertools.chain.from_iterable( + [ + collect_urls( + 
datasets.Places365, + ROOT, + split=split, + small=small, + download=True, ) for split, small in itertools.product(("train-standard", "train-challenge", "val"), (False, True)) ] @@ -229,85 +199,69 @@ def places365(): def caltech101(): - return collect_download_configs(lambda: datasets.Caltech101(ROOT, download=True), name="Caltech101") + return collect_urls(datasets.Caltech101, ROOT, download=True) def caltech256(): - return collect_download_configs(lambda: datasets.Caltech256(ROOT, download=True), name="Caltech256") + return collect_urls(datasets.Caltech256, ROOT, download=True) def cifar10(): - return collect_download_configs(lambda: datasets.CIFAR10(ROOT, download=True), name="CIFAR10") + return collect_urls(datasets.CIFAR10, ROOT, download=True) def cifar100(): - return collect_download_configs(lambda: datasets.CIFAR100(ROOT, download=True), name="CIFAR100") + return collect_urls(datasets.CIFAR100, ROOT, download=True) def voc(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.VOCSegmentation(ROOT, year=year, download=True), - name=f"VOC, {year}", - file="voc", - ) - for year in ("2007", "2007-test", "2008", "2009", "2010", "2011", "2012") + # TODO: Also test the "2007-test" key + return itertools.chain.from_iterable( + [ + collect_urls(datasets.VOCSegmentation, ROOT, year=year, download=True) + for year in ("2007", "2008", "2009", "2010", "2011", "2012") ] ) def mnist(): with unittest.mock.patch.object(datasets.MNIST, "mirrors", datasets.MNIST.mirrors[-1:]): - return collect_download_configs(lambda: datasets.MNIST(ROOT, download=True), name="MNIST") + return collect_urls(datasets.MNIST, ROOT, download=True) def fashion_mnist(): - return collect_download_configs(lambda: datasets.FashionMNIST(ROOT, download=True), name="FashionMNIST") + return collect_urls(datasets.FashionMNIST, ROOT, download=True) def kmnist(): - return collect_download_configs(lambda: datasets.KMNIST(ROOT, download=True), name="KMNIST") + return collect_urls(datasets.KMNIST, ROOT, download=True) def emnist(): # the 'split' argument can be any valid one, since everything is downloaded anyway - return collect_download_configs(lambda: datasets.EMNIST(ROOT, split="byclass", download=True), name="EMNIST") + return collect_urls(datasets.EMNIST, ROOT, split="byclass", download=True) def qmnist(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.QMNIST(ROOT, what=what, download=True), - name=f"QMNIST, {what}", - file="mnist", - ) - for what in ("train", "test", "nist") - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.QMNIST, ROOT, what=what, download=True) for what in ("train", "test", "nist")] ) +def moving_mnist(): + return collect_urls(datasets.MovingMNIST, ROOT, download=True) + + def omniglot(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Omniglot(ROOT, background=background, download=True), - name=f"Omniglot, {'background' if background else 'evaluation'}", - ) - for background in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.Omniglot, ROOT, background=background, download=True) for background in (True, False)] ) def phototour(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.PhotoTour(ROOT, name=name, download=True), - name=f"PhotoTour, {name}", - file="phototour", - ) + return itertools.chain.from_iterable( + [ + collect_urls(datasets.PhotoTour, ROOT, name=name, download=True) # The names postfixed with '_harris' point to the domain 
'matthewalunbrown.com'. For some reason all # requests timeout from within CI. They are disabled until this is resolved. for name in ("notredame", "yosemite", "liberty") # "notredame_harris", "yosemite_harris", "liberty_harris" @@ -316,91 +270,51 @@ def phototour(): def sbdataset(): - return collect_download_configs( - lambda: datasets.SBDataset(ROOT, download=True), - name="SBDataset", - file="voc", - ) + return collect_urls(datasets.SBDataset, ROOT, download=True) def sbu(): - return collect_download_configs( - lambda: datasets.SBU(ROOT, download=True), - name="SBU", - file="sbu", - ) + return collect_urls(datasets.SBU, ROOT, download=True) def semeion(): - return collect_download_configs( - lambda: datasets.SEMEION(ROOT, download=True), - name="SEMEION", - file="semeion", - ) + return collect_urls(datasets.SEMEION, ROOT, download=True) def stl10(): - return collect_download_configs( - lambda: datasets.STL10(ROOT, download=True), - name="STL10", - ) + return collect_urls(datasets.STL10, ROOT, download=True) def svhn(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.SVHN(ROOT, split=split, download=True), - name=f"SVHN, {split}", - file="svhn", - ) - for split in ("train", "test", "extra") - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.SVHN, ROOT, split=split, download=True) for split in ("train", "test", "extra")] ) def usps(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.USPS(ROOT, train=train, download=True), - name=f"USPS, {'train' if train else 'test'}", - file="usps", - ) - for train in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.USPS, ROOT, train=train, download=True) for train in (True, False)] ) def celeba(): - return collect_download_configs( - lambda: datasets.CelebA(ROOT, download=True), - name="CelebA", - file="celeba", - ) + return collect_urls(datasets.CelebA, ROOT, download=True) def widerface(): - return collect_download_configs( - lambda: datasets.WIDERFace(ROOT, download=True), - name="WIDERFace", - file="widerface", - ) + return collect_urls(datasets.WIDERFace, ROOT, download=True) def kinetics(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Kinetics( - path.join(ROOT, f"Kinetics{num_classes}"), - frames_per_clip=1, - num_classes=num_classes, - split=split, - download=True, - ), - name=f"Kinetics, {num_classes}, {split}", - file="kinetics", + return itertools.chain.from_iterable( + [ + collect_urls( + datasets.Kinetics, + path.join(ROOT, f"Kinetics{num_classes}"), + frames_per_clip=1, + num_classes=num_classes, + split=split, + download=True, ) for num_classes, split in itertools.product(("400", "600", "700"), ("train", "val")) ] @@ -408,57 +322,49 @@ def kinetics(): def kitti(): - return itertools.chain( - *[ - collect_download_configs( - lambda train=train: datasets.Kitti(ROOT, train=train, download=True), - name=f"Kitti, {'train' if train else 'test'}", - file="kitti", - ) - for train in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.Kitti, ROOT, train=train, download=True) for train in (True, False)] ) -def make_parametrize_kwargs(download_configs): - argvalues = [] - ids = [] - for config in download_configs: - argvalues.append((config.url, config.md5)) - ids.append(config.id) - - return dict(argnames=("url", "md5"), argvalues=argvalues, ids=ids) - - -@pytest.mark.parametrize( - **make_parametrize_kwargs( - itertools.chain( - caltech101(), - caltech256(), - cifar10(), - 
cifar100(), - # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details. - # voc(), - mnist(), - fashion_mnist(), - kmnist(), - emnist(), - qmnist(), - omniglot(), - phototour(), - sbdataset(), - semeion(), - stl10(), - svhn(), - usps(), - celeba(), - widerface(), - kinetics(), - kitti(), - ) +def url_parametrization(*dataset_urls_and_ids_fns): + return pytest.mark.parametrize( + "url", + [ + pytest.param(url, id=id) + for dataset_urls_and_ids_fn in dataset_urls_and_ids_fns + for url, id in sorted(set(dataset_urls_and_ids_fn())) + ], ) + + +@url_parametrization( + caltech101, + caltech256, + cifar10, + cifar100, + # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details. + # voc, + mnist, + fashion_mnist, + kmnist, + emnist, + qmnist, + omniglot, + phototour, + sbdataset, + semeion, + stl10, + svhn, + usps, + celeba, + widerface, + kinetics, + kitti, + places365, + sbu, ) -def test_url_is_accessible(url, md5): +def test_url_is_accessible(url): """ If you see this test failing, find the offending dataset in the parametrization and move it to ``test_url_is_not_accessible`` and link an issue detailing the problem. @@ -466,16 +372,11 @@ def test_url_is_accessible(url, md5): retry(lambda: assert_url_is_accessible(url)) -@pytest.mark.parametrize( - **make_parametrize_kwargs( - itertools.chain( - places365(), # https://github.com/pytorch/vision/issues/6268 - sbu(), # https://github.com/pytorch/vision/issues/6390 - ) - ) -) +# TODO: if e.g. caltech101 starts failing, remove the pytest.mark.parametrize below and use +# @url_parametrization(caltech101) +@pytest.mark.parametrize("url", ("http://url_that_doesnt_exist.com",)) # here until we actually have a failing dataset @pytest.mark.xfail -def test_url_is_not_accessible(url, md5): +def test_url_is_not_accessible(url): """ As the name implies, this test is the 'inverse' of ``test_url_is_accessible``. Since the download servers are beyond our control, some files might not be accessible for longer stretches of time. Still, we want to know if they @@ -484,9 +385,4 @@ def test_url_is_not_accessible(url, md5): If you see this test failing, find the offending dataset in the parametrization and move it to ``test_url_is_accessible``. 
""" - retry(lambda: assert_url_is_accessible(url)) - - -@pytest.mark.parametrize(**make_parametrize_kwargs(itertools.chain())) -def test_file_downloads_correctly(url, md5): - retry(lambda: assert_file_downloads_correctly(url, md5)) + assert_url_is_accessible(url) diff --git a/test/test_datasets_utils.py b/test/test_datasets_utils.py index ec68fd72a5b..461688405d7 100644 --- a/test/test_datasets_utils.py +++ b/test/test_datasets_utils.py @@ -7,7 +7,9 @@ import zipfile import pytest +import torch import torchvision.datasets.utils as utils +from common_utils import assert_equal from torch._utils_internal import get_file_path_2 from torchvision.datasets.folder import make_dataset from torchvision.datasets.utils import _COMPRESSED_FILE_OPENERS @@ -56,8 +58,11 @@ def test_get_redirect_url_max_hops_exceeded(self, mocker): assert mock.call_count == 1 assert mock.call_args[0][0].full_url == url - def test_check_md5(self): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_check_md5(self, use_pathlib): fpath = TEST_FILE + if use_pathlib: + fpath = pathlib.Path(fpath) correct_md5 = "9c0bb82894bb3af7f7675ef2b3b6dcdc" false_md5 = "" assert utils.check_md5(fpath, correct_md5) @@ -114,7 +119,8 @@ def test_detect_file_type_incompatible(self, file): utils._detect_file_type(file) @pytest.mark.parametrize("extension", [".bz2", ".gz", ".xz"]) - def test_decompress(self, extension, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_decompress(self, extension, tmpdir, use_pathlib): def create_compressed(root, content="this is the content"): file = os.path.join(root, "file") compressed = f"{file}{extension}" @@ -126,6 +132,8 @@ def create_compressed(root, content="this is the content"): return compressed, file, content compressed, file, content = create_compressed(tmpdir) + if use_pathlib: + compressed = pathlib.Path(compressed) utils._decompress(compressed) @@ -138,7 +146,8 @@ def test_decompress_no_compression(self): with pytest.raises(RuntimeError): utils._decompress("foo.tar") - def test_decompress_remove_finished(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_decompress_remove_finished(self, tmpdir, use_pathlib): def create_compressed(root, content="this is the content"): file = os.path.join(root, "file") compressed = f"{file}.gz" @@ -149,10 +158,20 @@ def create_compressed(root, content="this is the content"): return compressed, file, content compressed, file, content = create_compressed(tmpdir) + print(f"{type(compressed)=}") + if use_pathlib: + compressed = pathlib.Path(compressed) + tmpdir = pathlib.Path(tmpdir) - utils.extract_archive(compressed, tmpdir, remove_finished=True) + extracted_dir = utils.extract_archive(compressed, tmpdir, remove_finished=True) assert not os.path.exists(compressed) + if use_pathlib: + assert isinstance(extracted_dir, pathlib.Path) + assert isinstance(compressed, pathlib.Path) + else: + assert isinstance(extracted_dir, str) + assert isinstance(compressed, str) @pytest.mark.parametrize("extension", [".gz", ".xz"]) @pytest.mark.parametrize("remove_finished", [True, False]) @@ -165,7 +184,8 @@ def test_extract_archive_defer_to_decompress(self, extension, remove_finished, m mocked.assert_called_once_with(file, filename, remove_finished=remove_finished) - def test_extract_zip(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_extract_zip(self, tmpdir, use_pathlib): def create_archive(root, content="this is the content"): file = os.path.join(root, "dst.txt") archive = 
os.path.join(root, "archive.zip") @@ -175,6 +195,8 @@ def create_archive(root, content="this is the content"): return archive, file, content + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) archive, file, content = create_archive(tmpdir) utils.extract_archive(archive, tmpdir) @@ -187,7 +209,8 @@ def create_archive(root, content="this is the content"): @pytest.mark.parametrize( "extension, mode", [(".tar", "w"), (".tar.gz", "w:gz"), (".tgz", "w:gz"), (".tar.xz", "w:xz")] ) - def test_extract_tar(self, extension, mode, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_extract_tar(self, extension, mode, tmpdir, use_pathlib): def create_archive(root, extension, mode, content="this is the content"): src = os.path.join(root, "src.txt") dst = os.path.join(root, "dst.txt") @@ -201,6 +224,8 @@ def create_archive(root, extension, mode, content="this is the content"): return archive, dst, content + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) archive, file, content = create_archive(tmpdir, extension, mode) utils.extract_archive(archive, tmpdir) @@ -215,6 +240,24 @@ def test_verify_str_arg(self): pytest.raises(ValueError, utils.verify_str_arg, 0, ("a",), "arg") pytest.raises(ValueError, utils.verify_str_arg, "b", ("a",), "arg") + @pytest.mark.parametrize( + ("dtype", "actual_hex", "expected_hex"), + [ + (torch.uint8, "01 23 45 67 89 AB CD EF", "01 23 45 67 89 AB CD EF"), + (torch.float16, "01 23 45 67 89 AB CD EF", "23 01 67 45 AB 89 EF CD"), + (torch.int32, "01 23 45 67 89 AB CD EF", "67 45 23 01 EF CD AB 89"), + (torch.float64, "01 23 45 67 89 AB CD EF", "EF CD AB 89 67 45 23 01"), + ], + ) + def test_flip_byte_order(self, dtype, actual_hex, expected_hex): + def to_tensor(hex): + return torch.frombuffer(bytes.fromhex(hex), dtype=dtype) + + assert_equal( + utils._flip_byte_order(to_tensor(actual_hex)), + to_tensor(expected_hex), + ) + @pytest.mark.parametrize( ("kwargs", "expected_error_msg"), diff --git a/test/test_datasets_video_utils.py b/test/test_datasets_video_utils.py index adaa4f5446c..51330911e50 100644 --- a/test/test_datasets_video_utils.py +++ b/test/test_datasets_video_utils.py @@ -60,7 +60,7 @@ def test_video_clips_custom_fps(self, tmpdir): video_list = get_list_of_videos(tmpdir, num_videos=3, sizes=[12, 12, 12], fps=[3, 4, 6]) num_frames = 4 for fps in [1, 3, 4, 10]: - video_clips = VideoClips(video_list, num_frames, num_frames, fps, num_workers=2) + video_clips = VideoClips(video_list, num_frames, num_frames, fps) for i in range(video_clips.num_clips()): video, audio, info, video_idx = video_clips.get_clip(i) assert video.shape[0] == num_frames diff --git a/test/test_extended_models.py b/test/test_extended_models.py index 55259bb150d..a9072826724 100644 --- a/test/test_extended_models.py +++ b/test/test_extended_models.py @@ -1,12 +1,15 @@ +import copy import os +import pickle import pytest import test_models as TM import torch +from common_extended_utils import get_file_size_mb, get_ops from torchvision import models -from torchvision.models._api import get_model_weights, Weights, WeightsEnum +from torchvision.models import get_model_weights, Weights, WeightsEnum from torchvision.models._utils import handle_legacy_interface - +from torchvision.models.detection.backbone_utils import mobilenet_backbone, resnet_fpn_backbone run_if_test_with_extended = pytest.mark.skipif( os.getenv("PYTORCH_TEST_WITH_EXTENDED", "0") != "1", @@ -29,6 +32,21 @@ def test_get_model(name, model_class): assert isinstance(models.get_model(name), model_class) 
+@pytest.mark.parametrize( + "name, model_fn", + [ + ("resnet50", models.resnet50), + ("retinanet_resnet50_fpn_v2", models.detection.retinanet_resnet50_fpn_v2), + ("raft_large", models.optical_flow.raft_large), + ("quantized_resnet50", models.quantization.resnet50), + ("lraspp_mobilenet_v3_large", models.segmentation.lraspp_mobilenet_v3_large), + ("mvit_v1_b", models.video.mvit_v1_b), + ], +) +def test_get_model_builder(name, model_fn): + assert models.get_model_builder(name) == model_fn + + @pytest.mark.parametrize( "name, weight", [ @@ -44,24 +62,125 @@ def test_get_model_weights(name, weight): assert models.get_model_weights(name) == weight +@pytest.mark.parametrize("copy_fn", [copy.copy, copy.deepcopy]) +@pytest.mark.parametrize( + "name", + [ + "resnet50", + "retinanet_resnet50_fpn_v2", + "raft_large", + "quantized_resnet50", + "lraspp_mobilenet_v3_large", + "mvit_v1_b", + ], +) +def test_weights_copyable(copy_fn, name): + for weights in list(models.get_model_weights(name)): + # It is somewhat surprising that (deep-)copying is an identity operation here, but this is the default behavior + # of enums: https://docs.python.org/3/howto/enum.html#enum-members-aka-instances + # Checking for equality, i.e. `==`, is sufficient (and even preferable) for our use case, should we need to drop + # support for the identity operation in the future. + assert copy_fn(weights) is weights + + +@pytest.mark.parametrize( + "name", + [ + "resnet50", + "retinanet_resnet50_fpn_v2", + "raft_large", + "quantized_resnet50", + "lraspp_mobilenet_v3_large", + "mvit_v1_b", + ], +) +def test_weights_deserializable(name): + for weights in list(models.get_model_weights(name)): + # It is somewhat surprising that deserialization is an identity operation here, but this is the default behavior + # of enums: https://docs.python.org/3/howto/enum.html#enum-members-aka-instances + # Checking for equality, i.e. `==`, is sufficient (and even preferable) for our use case, should we need to drop + # support for the identity operation in the future. 
+ assert pickle.loads(pickle.dumps(weights)) is weights + + +def get_models_from_module(module): + return [ + v.__name__ + for k, v in module.__dict__.items() + if callable(v) and k[0].islower() and k[0] != "_" and k not in models._api.__all__ + ] + + @pytest.mark.parametrize( "module", [models, models.detection, models.quantization, models.segmentation, models.video, models.optical_flow] ) def test_list_models(module): - def get_models_from_module(module): - return [ - v.__name__ - for k, v in module.__dict__.items() - if callable(v) and k[0].islower() and k[0] != "_" and k not in models._api.__all__ - ] - a = set(get_models_from_module(module)) - b = set(x.replace("quantized_", "") for x in models.list_models(module)) + b = {x.replace("quantized_", "") for x in models.list_models(module)} assert len(b) > 0 assert a == b +@pytest.mark.parametrize( + "include_filters", + [ + None, + [], + (), + "", + "*resnet*", + ["*alexnet*"], + "*not-existing-model-for-test?", + ["*resnet*", "*alexnet*"], + ["*resnet*", "*alexnet*", "*not-existing-model-for-test?"], + ("*resnet*", "*alexnet*"), + {"*resnet*", "*alexnet*"}, + ], +) +@pytest.mark.parametrize( + "exclude_filters", + [ + None, + [], + (), + "", + "*resnet*", + ["*alexnet*"], + ["*not-existing-model-for-test?"], + ["resnet34", "*not-existing-model-for-test?"], + ["resnet34", "*resnet1*"], + ("resnet34", "*resnet1*"), + {"resnet34", "*resnet1*"}, + ], +) +def test_list_models_filters(include_filters, exclude_filters): + actual = set(models.list_models(models, include=include_filters, exclude=exclude_filters)) + classification_models = set(get_models_from_module(models)) + + if isinstance(include_filters, str): + include_filters = [include_filters] + if isinstance(exclude_filters, str): + exclude_filters = [exclude_filters] + + if include_filters: + expected = set() + for include_f in include_filters: + include_f = include_f.strip("*?") + expected = expected | {x for x in classification_models if include_f in x} + else: + expected = classification_models + + if exclude_filters: + for exclude_f in exclude_filters: + exclude_f = exclude_f.strip("*?") + if exclude_f != "": + a_exclude = {x for x in classification_models if exclude_f in x} + expected = expected - a_exclude + + assert expected == actual + + @pytest.mark.parametrize( "name, weight", [ @@ -96,6 +215,22 @@ def test_naming_conventions(model_fn): assert len(weights_enum) == 0 or hasattr(weights_enum, "DEFAULT") +detection_models_input_dims = { + "fasterrcnn_mobilenet_v3_large_320_fpn": (320, 320), + "fasterrcnn_mobilenet_v3_large_fpn": (800, 800), + "fasterrcnn_resnet50_fpn": (800, 800), + "fasterrcnn_resnet50_fpn_v2": (800, 800), + "fcos_resnet50_fpn": (800, 800), + "keypointrcnn_resnet50_fpn": (1333, 1333), + "maskrcnn_resnet50_fpn": (800, 800), + "maskrcnn_resnet50_fpn_v2": (800, 800), + "retinanet_resnet50_fpn": (800, 800), + "retinanet_resnet50_fpn_v2": (800, 800), + "ssd300_vgg16": (300, 300), + "ssdlite320_mobilenet_v3_large": (320, 320), +} + + @pytest.mark.parametrize( "model_fn", TM.list_model_fns(models) @@ -107,6 +242,9 @@ def test_naming_conventions(model_fn): ) @run_if_test_with_extended def test_schema_meta_validation(model_fn): + if model_fn.__name__ == "maskrcnn_resnet50_fpn_v2": + pytest.skip(reason="FIXME https://github.com/pytorch/vision/issues/7349") + # list of all possible supported high-level fields for weights meta-data permitted_fields = { "backend", @@ -120,11 +258,13 @@ def test_schema_meta_validation(model_fn): "recipe", "unquantized", "_docs", + "_ops", + 
"_file_size", } # mandatory fields for each computer vision task classification_fields = {"categories", ("_metrics", "ImageNet-1K", "acc@1"), ("_metrics", "ImageNet-1K", "acc@5")} defaults = { - "all": {"_metrics", "min_size", "num_params", "recipe", "_docs"}, + "all": {"_metrics", "min_size", "num_params", "recipe", "_docs", "_file_size", "_ops"}, "models": classification_fields, "detection": {"categories", ("_metrics", "COCO-val2017", "box_map")}, "quantization": classification_fields | {"backend", "unquantized"}, @@ -145,37 +285,60 @@ def test_schema_meta_validation(model_fn): pytest.skip(f"Model '{model_name}' doesn't have any pre-trained weights.") problematic_weights = {} - incorrect_params = [] + incorrect_meta = [] bad_names = [] for w in weights_enum: actual_fields = set(w.meta.keys()) - actual_fields |= set( + actual_fields |= { ("_metrics", dataset, metric_key) for dataset in w.meta.get("_metrics", {}).keys() for metric_key in w.meta.get("_metrics", {}).get(dataset, {}).keys() - ) + } missing_fields = expected_fields - actual_fields unsupported_fields = set(w.meta.keys()) - permitted_fields if missing_fields or unsupported_fields: problematic_weights[w] = {"missing": missing_fields, "unsupported": unsupported_fields} - if w == weights_enum.DEFAULT: + + if w == weights_enum.DEFAULT or any(w.meta[k] != weights_enum.DEFAULT.meta[k] for k in ["num_params", "_ops"]): if module_name == "quantization": # parameters() count doesn't work well with quantization, so we check against the non-quantized unquantized_w = w.meta.get("unquantized") - if unquantized_w is not None and w.meta.get("num_params") != unquantized_w.meta.get("num_params"): - incorrect_params.append(w) + if unquantized_w is not None: + if w.meta.get("num_params") != unquantized_w.meta.get("num_params"): + incorrect_meta.append((w, "num_params")) + + # the methodology for quantized ops count doesn't work as well, so we take unquantized FLOPs + # instead + if w.meta["_ops"] != unquantized_w.meta.get("_ops"): + incorrect_meta.append((w, "_ops")) + else: - if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()): - incorrect_params.append(w) - else: - if w.meta.get("num_params") != weights_enum.DEFAULT.meta.get("num_params"): - if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()): - incorrect_params.append(w) + # loading the model and using it for parameter and ops verification + model = model_fn(weights=w) + + if w.meta.get("num_params") != sum(p.numel() for p in model.parameters()): + incorrect_meta.append((w, "num_params")) + + kwargs = {} + if model_name in detection_models_input_dims: + # detection models have non default height and width + height, width = detection_models_input_dims[model_name] + kwargs = {"height": height, "width": width} + + if not model_fn.__name__.startswith("vit"): + # FIXME: https://github.com/pytorch/vision/issues/7871 + calculated_ops = get_ops(model=model, weight=w, **kwargs) + if calculated_ops != w.meta["_ops"]: + incorrect_meta.append((w, "_ops")) + if not w.name.isupper(): bad_names.append(w) + if get_file_size_mb(w) != w.meta.get("_file_size"): + incorrect_meta.append((w, "_file_size")) + assert not problematic_weights - assert not incorrect_params + assert not incorrect_meta assert not bad_names @@ -320,3 +483,21 @@ def builder(*, weights=None, flag): with pytest.raises(ValueError, match="weights"): builder(pretrained=True, flag=False) + + @pytest.mark.parametrize( + "model_fn", + [fn for fn in TM.list_model_fns(models) if 
fn.__name__ not in {"vit_h_14", "regnet_y_128gf"}] + + TM.list_model_fns(models.detection) + + TM.list_model_fns(models.quantization) + + TM.list_model_fns(models.segmentation) + + TM.list_model_fns(models.video) + + TM.list_model_fns(models.optical_flow) + + [ + lambda pretrained: resnet_fpn_backbone(backbone_name="resnet50", pretrained=pretrained), + lambda pretrained: mobilenet_backbone(backbone_name="mobilenet_v2", fpn=False, pretrained=pretrained), + ], + ) + @run_if_test_with_extended + def test_pretrained_deprecation(self, model_fn): + with pytest.warns(UserWarning, match="deprecated"): + model_fn(pretrained=True) diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 1914bc571fb..7d491372b77 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -2,17 +2,17 @@ import itertools import math import os -import re +from collections.abc import Sequence from functools import partial -from typing import Sequence import numpy as np +import PIL.Image import pytest import torch import torchvision.transforms as T +import torchvision.transforms._functional_pil as F_pil +import torchvision.transforms._functional_tensor as F_t import torchvision.transforms.functional as F -import torchvision.transforms.functional_pil as F_pil -import torchvision.transforms.functional_tensor as F_t from common_utils import ( _assert_approx_equal_tensor_to_pil, _assert_equal_tensor_to_pil, @@ -20,15 +20,20 @@ _create_data_batch, _test_fn_on_batch, assert_equal, - cpu_and_gpu, + cpu_and_cuda, needs_cuda, ) from torchvision.transforms import InterpolationMode -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fn", [F.get_image_size, F.get_image_num_channels, F.get_dimensions]) def test_image_sizes(device, fn): script_F = torch.jit.script(fn) @@ -66,7 +71,7 @@ class TestRotate: scripted_rotate = torch.jit.script(F.rotate) IMG_W = 26 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(7, 33), (26, IMG_W), (32, IMG_W)]) @pytest.mark.parametrize( "center", @@ -125,7 +130,7 @@ def test_rotate(self, device, height, width, center, dt, angle, expand, fill, fn f"{out_pil_tensor[0, :7, :7]}" ) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_rotate_batch(self, device, dt): if dt == torch.float16 and device == "cpu": @@ -139,33 +144,11 @@ def test_rotate_batch(self, device, dt): center = (20, 22) _test_fn_on_batch(batch_tensors, F.rotate, angle=32, interpolation=NEAREST, expand=True, center=center) - def test_rotate_deprecation_resample(self): - tensor, _ = _create_data(26, 26) - # assert deprecation warning and non-BC - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'resample' is deprecated since 0.12 and will be removed 0.14. " - "Please use 'interpolation' instead." 
- ), - ): - res1 = F.rotate(tensor, 45, resample=2) - res2 = F.rotate(tensor, 45, interpolation=BILINEAR) - assert_equal(res1, res2) - def test_rotate_interpolation_type(self): tensor, _ = _create_data(26, 26) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." - ), - ): - res1 = F.rotate(tensor, 45, interpolation=2) - res2 = F.rotate(tensor, 45, interpolation=BILINEAR) - assert_equal(res1, res2) + res1 = F.rotate(tensor, 45, interpolation=PIL.Image.BILINEAR) + res2 = F.rotate(tensor, 45, interpolation=BILINEAR) + assert_equal(res1, res2) class TestAffine: @@ -173,7 +156,7 @@ class TestAffine: ALL_DTYPES = [None, torch.float32, torch.float64, torch.float16] scripted_affine = torch.jit.script(F.affine) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_identity_map(self, device, height, width, dt): @@ -196,7 +179,7 @@ def test_identity_map(self, device, height, width, dt): ) assert_equal(tensor, out_tensor, msg=f"{out_tensor[0, :5, :5]} vs {tensor[0, :5, :5]}") - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize( @@ -240,7 +223,7 @@ def test_square_rotations(self, device, height, width, dt, angle, config, fn): # Tolerance : less than 6% of different pixels assert ratio_diff_pixels < 0.06 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize("angle", [90, 45, 15, -30, -60, -120]) @@ -274,7 +257,7 @@ def test_rect_rotations(self, device, height, width, dt, angle, fn, center): # Tolerance : less than 3% of different pixels assert ratio_diff_pixels < 0.03 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize("t", [[10, 12], (-12, -13)]) @@ -299,7 +282,7 @@ def test_translations(self, device, height, width, dt, t, fn): _assert_equal_tensor_to_pil(out_tensor, out_pil_img) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize( @@ -309,24 +292,8 @@ def test_translations(self, device, height, width, dt, t, fn): (33, (5, -4), 1.0, [0.0, 0.0], [0, 0, 0]), (45, [-5, 4], 1.2, [0.0, 0.0], (1, 2, 3)), (33, (-4, -8), 2.0, [0.0, 0.0], [255, 255, 255]), - ( - 85, - (10, -10), - 0.7, - [0.0, 0.0], - [ - 1, - ], - ), - ( - 0, - [0, 0], - 1.0, - [ - 35.0, - ], - (2.0,), - ), + (85, (10, -10), 0.7, [0.0, 0.0], [1]), + (0, [0, 0], 1.0, [35.0], (2.0,)), (-25, [0, 0], 1.2, [0.0, 15.0], None), (-45, [-10, 0], 0.7, [2.0, 5.0], None), (-45, [-10, -10], 1.2, [4.0, 5.0], None), @@ -360,7 +327,7 @@ def test_all_ops(self, device, height, width, dt, a, t, s, sh, f, fn): tol = 0.06 if device == "cuda" else 0.05 assert ratio_diff_pixels < tol - @pytest.mark.parametrize("device", 
cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_batches(self, device, dt): if dt == torch.float16 and device == "cpu": @@ -373,45 +340,13 @@ def test_batches(self, device, dt): _test_fn_on_batch(batch_tensors, F.affine, angle=-43, translate=[-3, 4], scale=1.2, shear=[4.0, 5.0]) - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_warnings(self, device): + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_interpolation_type(self, device): tensor, pil_img = _create_data(26, 26, device=device) - # assert deprecation warning and non-BC - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'resample' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'interpolation' instead." - ), - ): - res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], resample=2) - res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) - assert_equal(res1, res2) - - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." - ), - ): - res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=2) - res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) - assert_equal(res1, res2) - - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'fillcolor' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'fill' instead." - ), - ): - res1 = F.affine(pil_img, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], fillcolor=10) - res2 = F.affine(pil_img, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], fill=10) - # we convert the PIL images to numpy as assert_equal doesn't work on PIL images. 
- assert_equal(np.asarray(res1), np.asarray(res2)) + res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=PIL.Image.BILINEAR) + res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) + assert_equal(res1, res2) def _get_data_dims_and_points_for_perspective(): @@ -437,22 +372,10 @@ def _get_data_dims_and_points_for_perspective(): return dims_and_points -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dims_and_points", _get_data_dims_and_points_for_perspective()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize( - "fill", - ( - None, - [0, 0, 0], - [1, 2, 3], - [255, 255, 255], - [ - 1, - ], - (2.0,), - ), -) +@pytest.mark.parametrize("fill", (None, [0, 0, 0], [1, 2, 3], [255, 255, 255], [1], (2.0,))) @pytest.mark.parametrize("fn", [F.perspective, torch.jit.script(F.perspective)]) def test_perspective_pil_vs_tensor(device, dims_and_points, dt, fill, fn): @@ -483,7 +406,7 @@ def test_perspective_pil_vs_tensor(device, dims_and_points, dt, fill, fn): assert ratio_diff_pixels < 0.05 -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dims_and_points", _get_data_dims_and_points_for_perspective()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) def test_perspective_batch(device, dims_and_points, dt): @@ -511,40 +434,21 @@ def test_perspective_batch(device, dims_and_points, dt): ) -def test_perspective_interpolation_warning(): - # assert changed type warning +def test_perspective_interpolation_type(): spoints = [[0, 0], [33, 0], [33, 25], [0, 25]] epoints = [[3, 2], [32, 3], [30, 24], [2, 25]] tensor = torch.randint(0, 256, (3, 26, 26)) - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=2) - res2 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=BILINEAR) - assert_equal(res1, res2) + + res1 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=PIL.Image.BILINEAR) + res2 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=BILINEAR) + assert_equal(res1, res2) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize( - "size", - [ - 32, - 26, - [ - 32, - ], - [32, 32], - (32, 32), - [26, 35], - ], -) +@pytest.mark.parametrize("size", [32, 26, [32], [32, 32], (32, 32), [26, 35]]) @pytest.mark.parametrize("max_size", [None, 34, 40, 1000]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) +@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize(device, dt, size, max_size, interpolation): if dt == torch.float16 and device == "cpu": @@ -564,14 +468,12 @@ def test_resize(device, dt, size, max_size, interpolation): tensor = tensor.to(dt) batch_tensors = batch_tensors.to(dt) - resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, max_size=max_size) - resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, max_size=max_size) + resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, max_size=max_size, antialias=True) + resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, max_size=max_size, antialias=True) assert resized_tensor.size()[1:] == resized_pil_img.size[::-1] - if interpolation not in [ - NEAREST, - ]: + if interpolation != NEAREST: # We can not check values if mode = NEAREST, as results are different # E.g. resized_tensor = [[a, a, b, c, d, d, e, ...]] # E.g. resized_pil_img = [[a, b, c, c, d, e, f, ...]] @@ -581,36 +483,27 @@ def test_resize(device, dt, size, max_size, interpolation): resized_tensor_f = resized_tensor_f.to(torch.float) # Pay attention to high tolerance for MAE - _assert_approx_equal_tensor_to_pil(resized_tensor_f, resized_pil_img, tol=8.0) + _assert_approx_equal_tensor_to_pil(resized_tensor_f, resized_pil_img, tol=3.0) if isinstance(size, int): - script_size = [ - size, - ] + script_size = [size] else: script_size = size - resize_result = script_fn(tensor, size=script_size, interpolation=interpolation, max_size=max_size) + resize_result = script_fn(tensor, size=script_size, interpolation=interpolation, max_size=max_size, antialias=True) assert_equal(resized_tensor, resize_result) - _test_fn_on_batch(batch_tensors, F.resize, size=script_size, interpolation=interpolation, max_size=max_size) + _test_fn_on_batch( + batch_tensors, F.resize, size=script_size, interpolation=interpolation, max_size=max_size, antialias=True + ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_resize_asserts(device): tensor, pil_img = _create_data(26, 36, device=device) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.resize(tensor, size=32, interpolation=2) - + res1 = F.resize(tensor, size=32, interpolation=PIL.Image.BILINEAR) res2 = F.resize(tensor, size=32, interpolation=BILINEAR) assert_equal(res1, res2) @@ -622,7 +515,7 @@ def test_resize_asserts(device): F.resize(img, size=32, max_size=32) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("size", [[96, 72], [96, 420], [420, 72]]) @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) @@ -641,7 +534,7 @@ def test_resize_antialias(device, dt, size, interpolation): tensor = tensor.to(dt) resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, antialias=True) - resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation) + resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, antialias=True) assert resized_tensor.size()[1:] == resized_pil_img.size[::-1] @@ -675,40 +568,6 @@ def test_resize_antialias(device, dt, size, interpolation): assert_equal(resized_tensor, resize_result) -@needs_cuda -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) -def test_assert_resize_antialias(interpolation): - - # Checks implementation on very large scales - # and catch TORCH_CHECK inside PyTorch implementation - torch.manual_seed(12) - tensor, _ = _create_data(1000, 1000, device="cuda") - - # Error message is not yet updated in pytorch nightly - # with pytest.raises(RuntimeError, match=r"Provided interpolation parameters can not be handled"): - with pytest.raises(RuntimeError, match=r"Too much shared memory required"): - F.resize(tensor, size=(5, 5), interpolation=interpolation, antialias=True) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("dt", [torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize("size", [[10, 7], [10, 42], [42, 7]]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) -def test_interpolate_antialias_backward(device, dt, size, interpolation): - - if dt == torch.float16 and device == "cpu": - # skip float16 on CPU case - return - - torch.manual_seed(12) - x = (torch.rand(1, 32, 29, 3, dtype=torch.double, device=device).permute(0, 3, 1, 2).requires_grad_(True),) - resize = partial(F.resize, size=size, interpolation=interpolation, antialias=True) - assert torch.autograd.gradcheck(resize, x, eps=1e-8, atol=1e-6, rtol=1e-6, fast_mode=False) - - x = (torch.rand(1, 3, 32, 29, dtype=torch.double, device=device, requires_grad=True),) - assert torch.autograd.gradcheck(resize, x, eps=1e-8, atol=1e-6, rtol=1e-6, fast_mode=False) - - def check_functional_vs_PIL_vs_scripted( fn, fn_pil, fn_t, config, device, dtype, channels=3, tol=2.0 + 1e-10, agg_method="max" ): @@ -746,7 +605,7 @@ def check_functional_vs_PIL_vs_scripted( _test_fn_on_batch(batch_tensors, fn, scripted_fn_atol=atol, **config) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"brightness_factor": f} for f in (0.1, 0.5, 1.0, 1.34, 2.5)]) @pytest.mark.parametrize("channels", [1, 3]) @@ -762,7 +621,7 @@ def test_adjust_brightness(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, 
torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_invert(device, dtype, channels): @@ -771,7 +630,7 @@ def test_invert(device, dtype, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("config", [{"bits": bits} for bits in range(0, 8)]) @pytest.mark.parametrize("channels", [1, 3]) def test_posterize(device, config, channels): @@ -788,7 +647,7 @@ def test_posterize(device, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("config", [{"threshold": threshold} for threshold in [0, 64, 128, 192, 255]]) @pytest.mark.parametrize("channels", [1, 3]) def test_solarize1(device, config, channels): @@ -805,7 +664,7 @@ def test_solarize1(device, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"threshold": threshold} for threshold in [0.0, 0.25, 0.5, 0.75, 1.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -823,37 +682,45 @@ def test_solarize2(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [0.0, 0.25, 0.5, 0.75, 1.0]) -def test_solarize_threshold1_bound(threshold, device): - img = torch.rand((3, 12, 23)).to(device) - F_t.solarize(img, threshold) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [1.5]) -def test_solarize_threshold1_upper_bound(threshold, device): - img = torch.rand((3, 12, 23)).to(device) - with pytest.raises(TypeError, match="Threshold should be less than bound of img."): - F_t.solarize(img, threshold) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [0, 64, 128, 192, 255]) -def test_solarize_threshold2_bound(threshold, device): - img = torch.randint(0, 256, (3, 12, 23)).to(device) +@pytest.mark.parametrize( + ("dtype", "threshold"), + [ + *[ + (dtype, threshold) + for dtype, threshold in itertools.product( + [torch.float32, torch.float16], + [0.0, 0.25, 0.5, 0.75, 1.0], + ) + ], + *[(torch.uint8, threshold) for threshold in [0, 64, 128, 192, 255]], + *[(torch.int64, threshold) for threshold in [0, 2**32, 2**63 - 1]], + ], +) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_solarize_threshold_within_bound(threshold, dtype, device): + make_img = torch.rand if dtype.is_floating_point else partial(torch.randint, 0, torch.iinfo(dtype).max) + img = make_img((3, 12, 23), dtype=dtype, device=device) F_t.solarize(img, threshold) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [260]) -def test_solarize_threshold2_upper_bound(threshold, device): - img = torch.randint(0, 256, (3, 12, 23)).to(device) +@pytest.mark.parametrize( + ("dtype", "threshold"), + [ + (torch.float32, 1.5), + (torch.float16, 1.5), + (torch.uint8, 260), + (torch.int64, 2**64), + ], +) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_solarize_threshold_above_bound(threshold, dtype, device): + make_img = torch.rand if dtype.is_floating_point else partial(torch.randint, 0, torch.iinfo(dtype).max) + img = make_img((3, 12, 23), dtype=dtype, device=device) with pytest.raises(TypeError, match="Threshold should be less than bound of img."): F_t.solarize(img, threshold) -@pytest.mark.parametrize("device", cpu_and_gpu()) 
+@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"sharpness_factor": f} for f in [0.2, 0.5, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -869,7 +736,7 @@ def test_adjust_sharpness(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_autocontrast(device, dtype, channels): @@ -878,7 +745,7 @@ def test_autocontrast(device, dtype, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_autocontrast_equal_minmax(device, dtype, channels): @@ -890,7 +757,7 @@ def test_autocontrast_equal_minmax(device, dtype, channels): assert (F.autocontrast(a)[0] == F.autocontrast(a[0])).all() -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("channels", [1, 3]) def test_equalize(device, channels): torch.use_deterministic_algorithms(False) @@ -907,7 +774,7 @@ def test_equalize(device, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"contrast_factor": f} for f in [0.2, 0.5, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -917,7 +784,7 @@ def test_adjust_contrast(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"saturation_factor": f} for f in [0.5, 0.75, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -927,7 +794,7 @@ def test_adjust_saturation(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"hue_factor": f} for f in [-0.45, -0.25, 0.0, 0.25, 0.45]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -937,7 +804,7 @@ def test_adjust_hue(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"gamma": g1, "gain": g2} for g1, g2 in zip([0.8, 1.0, 1.2], [0.7, 1.0, 1.3])]) @pytest.mark.parametrize("channels", [1, 3]) @@ -953,7 +820,7 @@ def test_adjust_gamma(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("pad", [2, [3], [0, 3], (3, 3), [4, 2, 4, 3]]) @pytest.mark.parametrize( @@ -1003,14 +870,16 @@ def test_pad(device, dt, pad, config): _test_fn_on_batch(batch_tensors, F.pad, padding=script_pad, **config) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("mode", [NEAREST, BILINEAR, BICUBIC]) 
+@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("mode", [NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC]) def test_resized_crop(device, mode): # test values of F.resized_crop in several cases: # 1) resize to the same size, crop to the same size => should be identity tensor, _ = _create_data(26, 36, device=device) - out_tensor = F.resized_crop(tensor, top=0, left=0, height=26, width=36, size=[26, 36], interpolation=mode) + out_tensor = F.resized_crop( + tensor, top=0, left=0, height=26, width=36, size=[26, 36], interpolation=mode, antialias=True + ) assert_equal(tensor, out_tensor, msg=f"{out_tensor[0, :5, :5]} vs {tensor[0, :5, :5]}") # 2) resize by half and crop a TL corner @@ -1025,11 +894,18 @@ def test_resized_crop(device, mode): batch_tensors = _create_data_batch(26, 36, num_samples=4, device=device) _test_fn_on_batch( - batch_tensors, F.resized_crop, top=1, left=2, height=20, width=30, size=[10, 15], interpolation=NEAREST + batch_tensors, + F.resized_crop, + top=1, + left=2, + height=20, + width=30, + size=[10, 15], + interpolation=NEAREST, ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "func, args", [ @@ -1062,7 +938,7 @@ def test_assert_image_tensor(device, func, args): func(tensor, *args) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_vflip(device): script_vflip = torch.jit.script(F.vflip) @@ -1079,7 +955,7 @@ def test_vflip(device): _test_fn_on_batch(batch_tensors, F.vflip) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_hflip(device): script_hflip = torch.jit.script(F.hflip) @@ -1096,7 +972,7 @@ def test_hflip(device): _test_fn_on_batch(batch_tensors, F.hflip) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "top, left, height, width", [ @@ -1104,6 +980,8 @@ def test_hflip(device): (2, 12, 3, 4), # crop inside top-right corner (8, 3, 5, 6), # crop inside bottom-left corner (8, 11, 4, 3), # crop inside bottom-right corner + (50, 50, 10, 10), # crop outside the image + (-50, -50, 10, 10), # crop outside the image ], ) def test_crop(device, top, left, height, width): @@ -1123,7 +1001,7 @@ def test_crop(device, top, left, height, width): _test_fn_on_batch(batch_tensors, F.crop, top=top, left=left, height=height, width=width) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("image_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @@ -1146,7 +1024,8 @@ def test_gaussian_blur(device, image_size, dt, ksize, sigma, fn): # "23_23_1.7": ... 
# } p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") - true_cv2_results = torch.load(p) + + true_cv2_results = torch.load(p, weights_only=False) if image_size == "small": tensor = ( @@ -1177,7 +1056,7 @@ def test_gaussian_blur(device, image_size, dt, ksize, sigma, fn): torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_hsv2rgb(device): scripted_fn = torch.jit.script(F_t._hsv2rgb) shape = (3, 100, 150) @@ -1208,7 +1087,7 @@ def test_hsv2rgb(device): _test_fn_on_batch(batch_tensors, F_t._hsv2rgb) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_rgb2hsv(device): scripted_fn = torch.jit.script(F_t._rgb2hsv) shape = (3, 150, 100) @@ -1247,7 +1126,7 @@ def test_rgb2hsv(device): _test_fn_on_batch(batch_tensors, F_t._rgb2hsv) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("num_output_channels", (3, 1)) def test_rgb_to_grayscale(device, num_output_channels): script_rgb_to_grayscale = torch.jit.script(F.rgb_to_grayscale) @@ -1266,7 +1145,7 @@ def test_rgb_to_grayscale(device, num_output_channels): _test_fn_on_batch(batch_tensors, F.rgb_to_grayscale, num_output_channels=num_output_channels) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_center_crop(device): script_center_crop = torch.jit.script(F.center_crop) @@ -1284,7 +1163,7 @@ def test_center_crop(device): _test_fn_on_batch(batch_tensors, F.center_crop, output_size=[10, 11]) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_five_crop(device): script_five_crop = torch.jit.script(F.five_crop) @@ -1318,7 +1197,7 @@ def test_five_crop(device): assert_equal(transformed_batch, s_transformed_batch) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_ten_crop(device): script_ten_crop = torch.jit.script(F.ten_crop) @@ -1364,7 +1243,7 @@ def test_elastic_transform_asserts(): _ = F.elastic_transform(img_tensor, displacement=torch.rand(1, 2)) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize( diff --git a/test/test_hub.py b/test/test_hub.py deleted file mode 100644 index d88c6fa2cd2..00000000000 --- a/test/test_hub.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import shutil -import sys -import tempfile - -import pytest -import torch.hub as hub - - -def sum_of_model_parameters(model): - s = 0 - for p in model.parameters(): - s += p.sum() - return s - - -SUM_OF_PRETRAINED_RESNET18_PARAMS = -12703.9931640625 - - -@pytest.mark.skipif("torchvision" in sys.modules, reason="TestHub must start without torchvision imported") -class TestHub: - # Only run this check ONCE before all tests start. - # - If torchvision is imported before all tests start, e.g. we might find _C.so - # which doesn't exist in downloaded zip but in the installed wheel. - # - After the first test is run, torchvision is already in sys.modules due to - # Python cache as we run all hub tests in the same python process. 
- - def test_load_from_github(self): - hub_model = hub.load("pytorch/vision", "resnet18", weights="DEFAULT", progress=False) - assert sum_of_model_parameters(hub_model).item() == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS) - - def test_set_dir(self): - temp_dir = tempfile.gettempdir() - hub.set_dir(temp_dir) - hub_model = hub.load("pytorch/vision", "resnet18", weights="DEFAULT", progress=False) - assert sum_of_model_parameters(hub_model).item() == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS) - assert os.path.exists(temp_dir + "/pytorch_vision_master") - shutil.rmtree(temp_dir + "/pytorch_vision_master") - - def test_list_entrypoints(self): - entry_lists = hub.list("pytorch/vision", force_reload=True) - assert "resnet18" in entry_lists - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_image.py b/test/test_image.py index 7fcd54c9c8f..b11dd67ca12 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -1,20 +1,27 @@ +import concurrent.futures +import contextlib import glob import io import os +import re import sys from pathlib import Path import numpy as np import pytest +import requests import torch -import torchvision.transforms.functional as F -from common_utils import assert_equal, needs_cuda -from PIL import __version__ as PILLOW_VERSION, Image +import torchvision.transforms.v2.functional as F +from common_utils import assert_equal, cpu_and_cuda, IN_OSS_CI, needs_cuda +from PIL import __version__ as PILLOW_VERSION, Image, ImageOps, ImageSequence from torchvision.io.image import ( - _read_png_16, + decode_avif, + decode_gif, + decode_heic, decode_image, decode_jpeg, decode_png, + decode_webp, encode_jpeg, encode_png, ImageReadMode, @@ -32,8 +39,14 @@ DAMAGED_PNG = os.path.join(IMAGE_ROOT, "damaged_png") ENCODE_JPEG = os.path.join(IMAGE_ROOT, "encode_jpeg") INTERLACED_PNG = os.path.join(IMAGE_ROOT, "interlaced_png") +TOOSMALL_PNG = os.path.join(IMAGE_ROOT, "toosmall_png") IS_WINDOWS = sys.platform in ("win32", "cygwin") +IS_MACOS = sys.platform == "darwin" +IS_LINUX = sys.platform == "linux" PILLOW_VERSION = tuple(int(x) for x in PILLOW_VERSION.split(".")) +WEBP_TEST_IMAGES_DIR = os.environ.get("WEBP_TEST_IMAGES_DIR", "") +# See https://github.com/pytorch/vision/pull/8724#issuecomment-2503964558 +HEIC_AVIF_MESSAGE = "AVIF and HEIF only available on linux." def _get_safe_image_name(name): @@ -77,23 +90,24 @@ def normalize_dimensions(img_pil): ("RGB", ImageReadMode.RGB), ], ) -def test_decode_jpeg(img_path, pil_mode, mode): +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize("decode_fun", (decode_jpeg, decode_image)) +def test_decode_jpeg(img_path, pil_mode, mode, scripted, decode_fun): with Image.open(img_path) as img: is_cmyk = img.mode == "CMYK" if pil_mode is not None: - if is_cmyk: - # libjpeg does not support the conversion - pytest.xfail("Decoding a CMYK jpeg isn't supported") img = img.convert(pil_mode) img_pil = torch.from_numpy(np.array(img)) - if is_cmyk: + if is_cmyk and mode == ImageReadMode.UNCHANGED: # flip the colors to match libjpeg img_pil = 255 - img_pil img_pil = normalize_dimensions(img_pil) data = read_file(img_path) - img_ljpeg = decode_image(data, mode=mode) + if scripted: + decode_fun = torch.jit.script(decode_fun) + img_ljpeg = decode_fun(data, mode=mode) # Permit a small variation on pixel values to account for implementation # differences between Pillow and LibJPEG. 
@@ -101,15 +115,43 @@ def test_decode_jpeg(img_path, pil_mode, mode): assert abs_mean_diff < 2 -def test_decode_jpeg_errors(): - with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"): - decode_jpeg(torch.empty((100, 1), dtype=torch.uint8)) +@pytest.mark.parametrize("codec", ["png", "jpeg"]) +@pytest.mark.parametrize("orientation", [1, 2, 3, 4, 5, 6, 7, 8, 0]) +def test_decode_with_exif_orientation(tmpdir, codec, orientation): + fp = os.path.join(tmpdir, f"exif_oriented_{orientation}.{codec}") + t = torch.randint(0, 256, size=(3, 256, 257), dtype=torch.uint8) + im = F.to_pil_image(t) + exif = im.getexif() + exif[0x0112] = orientation # set exif orientation + im.save(fp, codec.upper(), exif=exif.tobytes()) - with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"): - decode_jpeg(torch.empty((100,), dtype=torch.float16)) + data = read_file(fp) + output = decode_image(data, apply_exif_orientation=True) + + pimg = Image.open(fp) + pimg = ImageOps.exif_transpose(pimg) + + expected = F.pil_to_tensor(pimg) + torch.testing.assert_close(expected, output) + + +@pytest.mark.parametrize("size", [65533, 1, 7, 10, 23, 33]) +def test_invalid_exif(tmpdir, size): + # Inspired from a PIL test: + # https://github.com/python-pillow/Pillow/blob/8f63748e50378424628155994efd7e0739a4d1d1/Tests/test_file_jpeg.py#L299 + fp = os.path.join(tmpdir, "invalid_exif.jpg") + t = torch.randint(0, 256, size=(3, 256, 257), dtype=torch.uint8) + im = F.to_pil_image(t) + im.save(fp, "JPEG", exif=b"1" * size) + + data = read_file(fp) + output = decode_image(data, apply_exif_orientation=True) + + pimg = Image.open(fp) + pimg = ImageOps.exif_transpose(pimg) - with pytest.raises(RuntimeError, match="Not a JPEG file"): - decode_jpeg(torch.empty((100), dtype=torch.uint8)) + expected = F.pil_to_tensor(pimg) + torch.testing.assert_close(expected, output) def test_decode_bad_huffman_images(): @@ -150,7 +192,12 @@ def test_damaged_corrupt_images(img_path): ("RGBA", ImageReadMode.RGB_ALPHA), ], ) -def test_decode_png(img_path, pil_mode, mode): +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize("decode_fun", (decode_png, decode_image)) +def test_decode_png(img_path, pil_mode, mode, scripted, decode_fun): + + if scripted: + decode_fun = torch.jit.script(decode_fun) with Image.open(img_path) as img: if pil_mode is not None: @@ -160,19 +207,14 @@ def test_decode_png(img_path, pil_mode, mode): img_pil = normalize_dimensions(img_pil) if img_path.endswith("16.png"): - # 16 bits image decoding is supported, but only as a private API - # FIXME: see https://github.com/pytorch/vision/issues/4731 for potential solutions to making it public - with pytest.raises(RuntimeError, match="At most 8-bit PNG images are supported"): - data = read_file(img_path) - img_lpng = decode_image(data, mode=mode) - - img_lpng = _read_png_16(img_path, mode=mode) - assert img_lpng.dtype == torch.int32 - # PIL converts 16 bits pngs in uint8 - img_lpng = torch.round(img_lpng / (2**16 - 1) * 255).to(torch.uint8) + data = read_file(img_path) + img_lpng = decode_fun(data, mode=mode) + assert img_lpng.dtype == torch.uint16 + # PIL converts 16 bits pngs to uint8 + img_lpng = F.to_dtype(img_lpng, torch.uint8, scale=True) else: data = read_file(img_path) - img_lpng = decode_image(data, mode=mode) + img_lpng = decode_fun(data, mode=mode) tol = 0 if pil_mode is None else 1 @@ -187,23 +229,23 @@ def test_decode_png(img_path, pil_mode, mode): def test_decode_png_errors(): - with pytest.raises(RuntimeError, 
match="Expected a non empty 1-dimensional tensor"): - decode_png(torch.empty((), dtype=torch.uint8)) - with pytest.raises(RuntimeError, match="Content is not png"): - decode_png(torch.randint(3, 5, (300,), dtype=torch.uint8)) with pytest.raises(RuntimeError, match="Out of bound read in decode_png"): decode_png(read_file(os.path.join(DAMAGED_PNG, "sigsegv.png"))) + with pytest.raises(RuntimeError, match="Content is too small for png"): + decode_png(read_file(os.path.join(TOOSMALL_PNG, "heapbof.png"))) @pytest.mark.parametrize( "img_path", [pytest.param(png_path, id=_get_safe_image_name(png_path)) for png_path in get_images(IMAGE_DIR, ".png")], ) -def test_encode_png(img_path): +@pytest.mark.parametrize("scripted", (True, False)) +def test_encode_png(img_path, scripted): pil_image = Image.open(img_path) img_pil = torch.from_numpy(np.array(pil_image)) img_pil = img_pil.permute(2, 0, 1) - png_buf = encode_png(img_pil, compression_level=6) + encode = torch.jit.script(encode_png) if scripted else encode_png + png_buf = encode(img_pil, compression_level=6) rec_img = Image.open(io.BytesIO(bytes(png_buf.tolist()))) rec_img = torch.from_numpy(np.array(rec_img)) @@ -230,27 +272,39 @@ def test_encode_png_errors(): "img_path", [pytest.param(png_path, id=_get_safe_image_name(png_path)) for png_path in get_images(IMAGE_DIR, ".png")], ) -def test_write_png(img_path, tmpdir): +@pytest.mark.parametrize("scripted", (True, False)) +def test_write_png(img_path, tmpdir, scripted): pil_image = Image.open(img_path) img_pil = torch.from_numpy(np.array(pil_image)) img_pil = img_pil.permute(2, 0, 1) filename, _ = os.path.splitext(os.path.basename(img_path)) torch_png = os.path.join(tmpdir, f"{filename}_torch.png") - write_png(img_pil, torch_png, compression_level=6) + write = torch.jit.script(write_png) if scripted else write_png + write(img_pil, torch_png, compression_level=6) saved_image = torch.from_numpy(np.array(Image.open(torch_png))) saved_image = saved_image.permute(2, 0, 1) assert_equal(img_pil, saved_image) -def test_read_file(tmpdir): +def test_read_image(): + # Just testing torchcsript, the functionality is somewhat tested already in other tests. 
+ path = next(get_images(IMAGE_ROOT, ".jpg")) + out = read_image(path) + out_scripted = torch.jit.script(read_image)(path) + torch.testing.assert_close(out, out_scripted, atol=0, rtol=0) + + +@pytest.mark.parametrize("scripted", (True, False)) +def test_read_file(tmpdir, scripted): fname, content = "test1.bin", b"TorchVision\211\n" fpath = os.path.join(tmpdir, fname) with open(fpath, "wb") as f: f.write(content) - data = read_file(fpath) + fun = torch.jit.script(read_file) if scripted else read_file + data = fun(fpath) expected = torch.tensor(list(content), dtype=torch.uint8) os.unlink(fpath) assert_equal(data, expected) @@ -271,11 +325,13 @@ def test_read_file_non_ascii(tmpdir): assert_equal(data, expected) -def test_write_file(tmpdir): +@pytest.mark.parametrize("scripted", (True, False)) +def test_write_file(tmpdir, scripted): fname, content = "test1.bin", b"TorchVision\211\n" fpath = os.path.join(tmpdir, fname) content_tensor = torch.tensor(list(content), dtype=torch.uint8) - write_file(fpath, content_tensor) + write = torch.jit.script(write_file) if scripted else write_file + write(fpath, content_tensor) with open(fpath, "rb") as f: saved_content = f.read() @@ -343,39 +399,64 @@ def test_read_1_bit_png_consistency(shape, mode, tmpdir): def test_read_interlaced_png(): imgs = list(get_images(INTERLACED_PNG, ".png")) with Image.open(imgs[0]) as im1, Image.open(imgs[1]) as im2: - assert not (im1.info.get("interlace") is im2.info.get("interlace")) + assert im1.info.get("interlace") is not im2.info.get("interlace") img1 = read_image(imgs[0]) img2 = read_image(imgs[1]) assert_equal(img1, img2) @needs_cuda -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(IMAGE_ROOT, ".jpg")], -) @pytest.mark.parametrize("mode", [ImageReadMode.UNCHANGED, ImageReadMode.GRAY, ImageReadMode.RGB]) @pytest.mark.parametrize("scripted", (False, True)) -def test_decode_jpeg_cuda(mode, img_path, scripted): - if "cmyk" in img_path: - pytest.xfail("Decoding a CMYK jpeg isn't supported") +def test_decode_jpegs_cuda(mode, scripted): + encoded_images = [] + for jpeg_path in get_images(IMAGE_ROOT, ".jpg"): + if "cmyk" in jpeg_path: + continue + encoded_image = read_file(jpeg_path) + encoded_images.append(encoded_image) + decoded_images_cpu = decode_jpeg(encoded_images, mode=mode) + decode_fn = torch.jit.script(decode_jpeg) if scripted else decode_jpeg + + # test multithreaded decoding + # in the current version we prevent this by using a lock but we still want to test it + num_workers = 10 + + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(decode_fn, encoded_images, mode, "cuda") for _ in range(num_workers)] + decoded_images_threaded = [future.result() for future in futures] + assert len(decoded_images_threaded) == num_workers + for decoded_images in decoded_images_threaded: + assert len(decoded_images) == len(encoded_images) + for decoded_image_cuda, decoded_image_cpu in zip(decoded_images, decoded_images_cpu): + assert decoded_image_cuda.shape == decoded_image_cpu.shape + assert decoded_image_cuda.dtype == decoded_image_cpu.dtype == torch.uint8 + assert (decoded_image_cuda.cpu().float() - decoded_image_cpu.cpu().float()).abs().mean() < 2 - data = read_file(img_path) - img = decode_image(data, mode=mode) - f = torch.jit.script(decode_jpeg) if scripted else decode_jpeg - img_nvjpeg = f(data, mode=mode, device="cuda") - # Some difference expected between jpeg implementations - assert 
(img.float() - img_nvjpeg.cpu().float()).abs().mean() < 2 +@needs_cuda +def test_decode_image_cuda_raises(): + data = torch.randint(0, 127, size=(255,), device="cuda", dtype=torch.uint8) + with pytest.raises(RuntimeError): + decode_image(data) @needs_cuda -@pytest.mark.parametrize("cuda_device", ("cuda", "cuda:0", torch.device("cuda"))) -def test_decode_jpeg_cuda_device_param(cuda_device): - """Make sure we can pass a string or a torch.device as device param""" +def test_decode_jpeg_cuda_device_param(): path = next(path for path in get_images(IMAGE_ROOT, ".jpg") if "cmyk" not in path) data = read_file(path) - decode_jpeg(data, device=cuda_device) + current_device = torch.cuda.current_device() + current_stream = torch.cuda.current_stream() + num_devices = torch.cuda.device_count() + devices = ["cuda", torch.device("cuda")] + [torch.device(f"cuda:{i}") for i in range(num_devices)] + results = [] + for device in devices: + results.append(decode_jpeg(data, device=device)) + assert len(results) == len(devices) + for result in results: + assert torch.all(result.cpu() == results[0].cpu()) + assert current_device == torch.cuda.current_device() + assert current_stream == torch.cuda.current_stream() @needs_cuda @@ -383,12 +464,73 @@ def test_decode_jpeg_cuda_errors(): data = read_file(next(get_images(IMAGE_ROOT, ".jpg"))) with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"): decode_jpeg(data.reshape(-1, 1), device="cuda") - with pytest.raises(RuntimeError, match="input tensor must be on CPU"): + with pytest.raises(ValueError, match="must be tensors"): + decode_jpeg([1, 2, 3]) + with pytest.raises(ValueError, match="Input tensor must be a CPU tensor"): decode_jpeg(data.to("cuda"), device="cuda") with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"): decode_jpeg(data.to(torch.float), device="cuda") - with pytest.raises(RuntimeError, match="Expected a cuda device"): - torch.ops.image.decode_jpeg_cuda(data, ImageReadMode.UNCHANGED.value, "cpu") + with pytest.raises(RuntimeError, match="Expected the device parameter to be a cuda device"): + torch.ops.image.decode_jpegs_cuda([data], ImageReadMode.UNCHANGED.value, "cpu") + with pytest.raises(ValueError, match="Input tensor must be a CPU tensor"): + decode_jpeg( + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ) + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cuda"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cuda"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ], + device="cuda", + ) + + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cpu"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((100,), dtype=torch.float32), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((1, 100), dtype=torch.uint8), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Error while decoding JPEG 
images"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((100,), dtype=torch.uint8), + ], + device="cuda", + ) + + with pytest.raises(ValueError, match="Input list must contain at least one element"): + decode_jpeg([], device="cuda") def test_encode_jpeg_errors(): @@ -412,102 +554,262 @@ def test_encode_jpeg_errors(): encode_jpeg(torch.empty((100, 100), dtype=torch.uint8)) -def _collect_if(cond): - # TODO: remove this once test_encode_jpeg_reference and test_write_jpeg_reference - # are removed - def _inner(test_func): - if cond: - return test_func - else: - return pytest.mark.dont_collect(test_func) - - return _inner - - -@_collect_if(cond=False) +@pytest.mark.skipif(IS_MACOS, reason="https://github.com/pytorch/vision/issues/8031") @pytest.mark.parametrize( "img_path", [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], ) -def test_encode_jpeg_reference(img_path): - # This test is *wrong*. - # It compares a torchvision-encoded jpeg with a PIL-encoded jpeg (the reference), but it - # starts encoding the torchvision version from an image that comes from - # decode_jpeg, which can yield different results from pil.decode (see - # test_decode... which uses a high tolerance). - # Instead, we should start encoding from the exact same decoded image, for a - # valid comparison. This is done in test_encode_jpeg, but unfortunately - # these more correct tests fail on windows (probably because of a difference - # in libjpeg) between torchvision and PIL. - # FIXME: make the correct tests pass on windows and remove this. - dirname = os.path.dirname(img_path) - filename, _ = os.path.splitext(os.path.basename(img_path)) - write_folder = os.path.join(dirname, "jpeg_write") - expected_file = os.path.join(write_folder, f"{filename}_pil.jpg") - img = decode_jpeg(read_file(img_path)) +@pytest.mark.parametrize("scripted", (True, False)) +def test_encode_jpeg(img_path, scripted): + img = read_image(img_path) - with open(expected_file, "rb") as f: - pil_bytes = f.read() - pil_bytes = torch.as_tensor(list(pil_bytes), dtype=torch.uint8) + pil_img = F.to_pil_image(img) + buf = io.BytesIO() + pil_img.save(buf, format="JPEG", quality=75) + + encoded_jpeg_pil = torch.frombuffer(buf.getvalue(), dtype=torch.uint8) + + encode = torch.jit.script(encode_jpeg) if scripted else encode_jpeg for src_img in [img, img.contiguous()]: - # PIL sets jpeg quality to 75 by default - jpeg_bytes = encode_jpeg(src_img, quality=75) - assert_equal(jpeg_bytes, pil_bytes) + encoded_jpeg_torch = encode(src_img, quality=75) + assert_equal(encoded_jpeg_torch, encoded_jpeg_pil) + + +@needs_cuda +def test_encode_jpeg_cuda_device_param(): + path = next(path for path in get_images(IMAGE_ROOT, ".jpg") if "cmyk" not in path) + + data = read_image(path) + current_device = torch.cuda.current_device() + current_stream = torch.cuda.current_stream() + num_devices = torch.cuda.device_count() + devices = ["cuda", torch.device("cuda")] + [torch.device(f"cuda:{i}") for i in range(num_devices)] + results = [] + for device in devices: + results.append(encode_jpeg(data.to(device=device))) + assert len(results) == len(devices) + for result in results: + assert torch.all(result.cpu() == results[0].cpu()) + assert current_device == torch.cuda.current_device() + assert current_stream == torch.cuda.current_stream() -@_collect_if(cond=False) + +@needs_cuda @pytest.mark.parametrize( "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in 
get_images(ENCODE_JPEG, ".jpg")], + [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(IMAGE_ROOT, ".jpg")], ) -def test_write_jpeg_reference(img_path, tmpdir): - # FIXME: Remove this eventually, see test_encode_jpeg_reference - data = read_file(img_path) - img = decode_jpeg(data) +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize("contiguous", (False, True)) +def test_encode_jpeg_cuda(img_path, scripted, contiguous): + decoded_image_tv = read_image(img_path) + encode_fn = torch.jit.script(encode_jpeg) if scripted else encode_jpeg - basedir = os.path.dirname(img_path) - filename, _ = os.path.splitext(os.path.basename(img_path)) - torch_jpeg = os.path.join(tmpdir, f"{filename}_torch.jpg") - pil_jpeg = os.path.join(basedir, "jpeg_write", f"{filename}_pil.jpg") + if "cmyk" in img_path: + pytest.xfail("Encoding a CMYK jpeg isn't supported") + if decoded_image_tv.shape[0] == 1: + pytest.xfail("Decoding a grayscale jpeg isn't supported") + # For more detail as to why check out: https://github.com/NVIDIA/cuda-samples/issues/23#issuecomment-559283013 + if contiguous: + decoded_image_tv = decoded_image_tv[None].contiguous(memory_format=torch.contiguous_format)[0] + else: + decoded_image_tv = decoded_image_tv[None].contiguous(memory_format=torch.channels_last)[0] + encoded_jpeg_cuda_tv = encode_fn(decoded_image_tv.cuda(), quality=75) + decoded_jpeg_cuda_tv = decode_jpeg(encoded_jpeg_cuda_tv.cpu()) - write_jpeg(img, torch_jpeg, quality=75) + # the actual encoded bytestreams from libnvjpeg and libjpeg-turbo differ for the same quality + # instead, we re-decode the encoded image and compare to the original + abs_mean_diff = (decoded_jpeg_cuda_tv.float() - decoded_image_tv.float()).abs().mean().item() + assert abs_mean_diff < 3 - with open(torch_jpeg, "rb") as f: - torch_bytes = f.read() - with open(pil_jpeg, "rb") as f: - pil_bytes = f.read() +@needs_cuda +def test_encode_jpeg_cuda_sync(): + """ + Non-regression test for https://github.com/pytorch/vision/issues/8587. + Attempts to reproduce an intermittent CUDA stream synchronization bug + by randomly creating images and round-tripping them via encode_jpeg + and decode_jpeg on the GPU. Fails if the mean difference in uint8 range + exceeds 5. 
+ """ + torch.manual_seed(42) + + # manual testing shows this bug appearing often in iterations between 50 and 100 + # as a synchronization bug, this can't be reliably reproduced + max_iterations = 100 + threshold = 5.0 # in [0..255] + + device = torch.device("cuda") + + for iteration in range(max_iterations): + height, width = torch.randint(4000, 5000, size=(2,)) + + image = torch.linspace(0, 1, steps=height * width, device=device) + image = image.view(1, height, width).expand(3, -1, -1) + + image = (image * 255).clamp(0, 255).to(torch.uint8) + jpeg_bytes = encode_jpeg(image, quality=100) + + decoded_image = decode_jpeg(jpeg_bytes.cpu(), device=device) + mean_difference = (image.float() - decoded_image.float()).abs().mean().item() + + assert mean_difference <= threshold, ( + f"Encode/decode mismatch at iteration={iteration}, " + f"size={height}x{width}, mean diff={mean_difference:.2f}" + ) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("scripted", (True, False)) +@pytest.mark.parametrize("contiguous", (True, False)) +def test_encode_jpegs_batch(scripted, contiguous, device): + if device == "cpu" and IS_MACOS: + pytest.skip("https://github.com/pytorch/vision/issues/8031") + decoded_images_tv = [] + for jpeg_path in get_images(IMAGE_ROOT, ".jpg"): + if "cmyk" in jpeg_path: + continue + decoded_image = read_image(jpeg_path) + if decoded_image.shape[0] == 1: + continue + if contiguous: + decoded_image = decoded_image[None].contiguous(memory_format=torch.contiguous_format)[0] + else: + decoded_image = decoded_image[None].contiguous(memory_format=torch.channels_last)[0] + decoded_images_tv.append(decoded_image) + + encode_fn = torch.jit.script(encode_jpeg) if scripted else encode_jpeg + + decoded_images_tv_device = [img.to(device=device) for img in decoded_images_tv] + encoded_jpegs_tv_device = encode_fn(decoded_images_tv_device, quality=75) + encoded_jpegs_tv_device = [decode_jpeg(img.cpu()) for img in encoded_jpegs_tv_device] + + for original, encoded_decoded in zip(decoded_images_tv, encoded_jpegs_tv_device): + c, h, w = original.shape + abs_mean_diff = (original.float() - encoded_decoded.float()).abs().mean().item() + assert abs_mean_diff < 3 + + # test multithreaded decoding + # in the current version we prevent this by using a lock but we still want to test it + num_workers = 10 + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(encode_fn, decoded_images_tv_device) for _ in range(num_workers)] + encoded_images_threaded = [future.result() for future in futures] + assert len(encoded_images_threaded) == num_workers + for encoded_images in encoded_images_threaded: + assert len(decoded_images_tv_device) == len(encoded_images) + for i, (encoded_image_cuda, decoded_image_tv) in enumerate(zip(encoded_images, decoded_images_tv_device)): + # make sure all the threads produce identical outputs + assert torch.all(encoded_image_cuda == encoded_images_threaded[0][i]) + + # make sure the outputs are identical or close enough to baseline + decoded_cuda_encoded_image = decode_jpeg(encoded_image_cuda.cpu()) + assert decoded_cuda_encoded_image.shape == decoded_image_tv.shape + assert decoded_cuda_encoded_image.dtype == decoded_image_tv.dtype + assert (decoded_cuda_encoded_image.cpu().float() - decoded_image_tv.cpu().float()).abs().mean() < 3 - assert_equal(torch_bytes, pil_bytes) +@needs_cuda +def test_single_encode_jpeg_cuda_errors(): + with pytest.raises(RuntimeError, match="Input tensor dtype should be 
uint8"): + encode_jpeg(torch.empty((3, 100, 100), dtype=torch.float32, device="cuda")) -# TODO: Remove the skip. See https://github.com/pytorch/vision/issues/5162. -@pytest.mark.skip("this test fails because PIL uses libjpeg-turbo") -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], -) -def test_encode_jpeg(img_path): - img = read_image(img_path) + with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 5"): + encode_jpeg(torch.empty((5, 100, 100), dtype=torch.uint8, device="cuda")) - pil_img = F.to_pil_image(img) - buf = io.BytesIO() - pil_img.save(buf, format="JPEG", quality=75) + with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 1"): + encode_jpeg(torch.empty((1, 100, 100), dtype=torch.uint8, device="cuda")) - encoded_jpeg_pil = torch.frombuffer(buf.getvalue(), dtype=torch.uint8) + with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"): + encode_jpeg(torch.empty((1, 3, 100, 100), dtype=torch.uint8, device="cuda")) - for src_img in [img, img.contiguous()]: - encoded_jpeg_torch = encode_jpeg(src_img, quality=75) - assert_equal(encoded_jpeg_torch, encoded_jpeg_pil) + with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"): + encode_jpeg(torch.empty((100, 100), dtype=torch.uint8, device="cuda")) + + +@needs_cuda +def test_batch_encode_jpegs_cuda_errors(): + with pytest.raises(RuntimeError, match="Input tensor dtype should be uint8"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((3, 100, 100), dtype=torch.float32, device="cuda"), + ] + ) + + with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 5"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((5, 100, 100), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 1"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((1, 100, 100), dtype=torch.uint8, device="cuda"), + ] + ) + with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((1, 3, 100, 100), dtype=torch.uint8, device="cuda"), + ] + ) -# TODO: Remove the skip. See https://github.com/pytorch/vision/issues/5162. 
-@pytest.mark.skip("this test fails because PIL uses libjpeg-turbo") + with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((100, 100), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises(RuntimeError, match="Input tensor should be on CPU"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cpu"), + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises( + RuntimeError, match="All input tensors must be on the same CUDA device when encoding with nvjpeg" + ): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((3, 100, 100), dtype=torch.uint8, device="cpu"), + ] + ) + + if torch.cuda.device_count() >= 2: + with pytest.raises( + RuntimeError, match="All input tensors must be on the same CUDA device when encoding with nvjpeg" + ): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda:0"), + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda:1"), + ] + ) + + with pytest.raises(ValueError, match="encode_jpeg requires at least one input tensor when a list is passed"): + encode_jpeg([]) + + +@pytest.mark.skipif(IS_MACOS, reason="https://github.com/pytorch/vision/issues/8031") @pytest.mark.parametrize( "img_path", [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], ) -def test_write_jpeg(img_path, tmpdir): +@pytest.mark.parametrize("scripted", (True, False)) +def test_write_jpeg(img_path, tmpdir, scripted): tmpdir = Path(tmpdir) img = read_image(img_path) pil_img = F.to_pil_image(img) @@ -515,7 +817,8 @@ def test_write_jpeg(img_path, tmpdir): torch_jpeg = str(tmpdir / "torch.jpg") pil_jpeg = str(tmpdir / "pil.jpg") - write_jpeg(img, torch_jpeg, quality=75) + write = torch.jit.script(write_jpeg) if scripted else write_jpeg + write(img, torch_jpeg, quality=75) pil_img.save(pil_jpeg, quality=75) with open(torch_jpeg, "rb") as f: @@ -527,5 +830,301 @@ def test_write_jpeg(img_path, tmpdir): assert_equal(torch_bytes, pil_bytes) +def test_pathlib_support(tmpdir): + # Just make sure pathlib.Path is supported where relevant + + jpeg_path = Path(next(get_images(ENCODE_JPEG, ".jpg"))) + + read_file(jpeg_path) + read_image(jpeg_path) + + write_path = Path(tmpdir) / "whatever" + img = torch.randint(0, 10, size=(3, 4, 4), dtype=torch.uint8) + + write_file(write_path, data=img.flatten()) + write_jpeg(img, write_path) + write_png(img, write_path) + + +@pytest.mark.parametrize( + "name", ("gifgrid", "fire", "porsche", "treescap", "treescap-interlaced", "solid2", "x-trans", "earth") +) +@pytest.mark.parametrize("scripted", (True, False)) +def test_decode_gif(tmpdir, name, scripted): + # Using test images from GIFLIB + # https://sourceforge.net/p/giflib/code/ci/master/tree/pic/, we assert PIL + # and torchvision decoded outputs are equal. + # We're not testing against "welcome2" because PIL and GIFLIB disagee on what + # the background color should be (likely a difference in the way they handle + # transparency?) + # 'earth' image is from wikipedia, licensed under CC BY-SA 3.0 + # https://creativecommons.org/licenses/by-sa/3.0/ + # it allows to properly test for transparency, TOP-LEFT offsets, and + # disposal modes. + + path = tmpdir / f"{name}.gif" + if name == "earth": + if IN_OSS_CI: + # TODO: Fix this... one day. 
+ pytest.skip("Skipping 'earth' test as it's flaky on OSS CI") + url = "https://upload.wikimedia.org/wikipedia/commons/2/2c/Rotating_earth_%28large%29.gif" + else: + url = f"https://sourceforge.net/p/giflib/code/ci/master/tree/pic/{name}.gif?format=raw" + with open(path, "wb") as f: + f.write(requests.get(url).content) + + encoded_bytes = read_file(path) + f = torch.jit.script(decode_gif) if scripted else decode_gif + tv_out = f(encoded_bytes) + if tv_out.ndim == 3: + tv_out = tv_out[None] + + assert tv_out.is_contiguous(memory_format=torch.channels_last) + + # For some reason, not using Image.open() as a CM causes "ResourceWarning: unclosed file" + with Image.open(path) as pil_img: + pil_seq = ImageSequence.Iterator(pil_img) + + for pil_frame, tv_frame in zip(pil_seq, tv_out): + pil_frame = F.pil_to_tensor(pil_frame.convert("RGB")) + torch.testing.assert_close(tv_frame, pil_frame, atol=0, rtol=0) + + +@pytest.mark.parametrize( + "decode_fun, match", + [ + (decode_png, "Content is not png"), + (decode_jpeg, "Not a JPEG file"), + (decode_gif, re.escape("DGifOpenFileName() failed - 103")), + (decode_webp, "WebPGetFeatures failed."), + pytest.param( + decode_avif, + "BMFF parsing failed", + # marks=pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE) + marks=pytest.mark.skipif(True, reason="Skipping avif/heic tests for now."), + ), + pytest.param( + decode_heic, + "Invalid input: No 'ftyp' box", + # marks=pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE), + marks=pytest.mark.skipif(True, reason="Skipping avif/heic tests for now."), + ), + ], +) +def test_decode_bad_encoded_data(decode_fun, match): + encoded_data = torch.randint(0, 256, (100,), dtype=torch.uint8) + with pytest.raises(RuntimeError, match="Input tensor must be 1-dimensional"): + decode_fun(encoded_data[None]) + with pytest.raises(RuntimeError, match="Input tensor must have uint8 data type"): + decode_fun(encoded_data.float()) + with pytest.raises(RuntimeError, match="Input tensor must be contiguous"): + decode_fun(encoded_data[::2]) + with pytest.raises(RuntimeError, match=match): + decode_fun(encoded_data) + + +@pytest.mark.parametrize("decode_fun", (decode_webp, decode_image)) +@pytest.mark.parametrize("scripted", (False, True)) +def test_decode_webp(decode_fun, scripted): + encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".webp"))) + if scripted: + decode_fun = torch.jit.script(decode_fun) + img = decode_fun(encoded_bytes) + assert img.shape == (3, 100, 100) + assert img[None].is_contiguous(memory_format=torch.channels_last) + img += 123 # make sure image buffer wasn't freed by underlying decoding lib + + +@pytest.mark.parametrize("decode_fun", (decode_webp, decode_image)) +def test_decode_webp_grayscale(decode_fun, capfd): + encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".webp"))) + + # We warn at the C++ layer because for decode_image(), we don't do the image + # type dispatch until we get to the C++ version of decode_image(). We could + # warn at the Python layer in decode_webp(), but then users would get a + # double wanring: one from the Python layer and one from the C++ layer. + # + # Because we use the TORCH_WARN_ONCE macro, we need to do this dance to + # temporarily always warn so we can test. 
+ @contextlib.contextmanager + def set_always_warn(): + torch._C._set_warnAlways(True) + yield + torch._C._set_warnAlways(False) + + with set_always_warn(): + img = decode_fun(encoded_bytes, mode=ImageReadMode.GRAY) + assert "Webp does not support grayscale conversions" in capfd.readouterr().err + + # Note that because we do not support grayscale conversions, we expect + # that the number of color channels is still 3. + assert img.shape == (3, 100, 100) + + +# This test is skipped by default because it requires webp images that we're not +# including within the repo. The test images were downloaded manually from the +# different pages of https://developers.google.com/speed/webp/gallery +@pytest.mark.skipif(not WEBP_TEST_IMAGES_DIR, reason="WEBP_TEST_IMAGES_DIR is not set") +@pytest.mark.parametrize("decode_fun", (decode_webp, decode_image)) +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize( + "mode, pil_mode", + ( + # Note that converting an RGBA image to RGB leads to bad results because the + # transparent pixels aren't necessarily set to "black" or "white", they can be + # random stuff. This is consistent with PIL results. + (ImageReadMode.RGB, "RGB"), + (ImageReadMode.RGB_ALPHA, "RGBA"), + (ImageReadMode.UNCHANGED, None), + ), +) +@pytest.mark.parametrize("filename", Path(WEBP_TEST_IMAGES_DIR).glob("*.webp"), ids=lambda p: p.name) +def test_decode_webp_against_pil(decode_fun, scripted, mode, pil_mode, filename): + encoded_bytes = read_file(filename) + if scripted: + decode_fun = torch.jit.script(decode_fun) + img = decode_fun(encoded_bytes, mode=mode) + assert img[None].is_contiguous(memory_format=torch.channels_last) + + pil_img = Image.open(filename).convert(pil_mode) + from_pil = F.pil_to_tensor(pil_img) + assert_equal(img, from_pil) + img += 123 # make sure image buffer wasn't freed by underlying decoding lib + + +# @pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE) +@pytest.mark.skipif(True, reason="Skipping avif/heic tests for now.") +@pytest.mark.parametrize("decode_fun", (decode_avif,)) +def test_decode_avif(decode_fun): + encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".avif"))) + img = decode_fun(encoded_bytes) + assert img.shape == (3, 100, 100) + assert img[None].is_contiguous(memory_format=torch.channels_last) + img += 123 # make sure image buffer wasn't freed by underlying decoding lib + + +# Note: decode_image fails because some of these files have a (valid) signature +# we don't recognize. We should probably use libmagic.... +# @pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE) +@pytest.mark.skipif(True, reason="Skipping avif/heic tests for now.") +@pytest.mark.parametrize("decode_fun", (decode_avif, decode_heic)) +@pytest.mark.parametrize( + "mode, pil_mode", + ( + (ImageReadMode.RGB, "RGB"), + (ImageReadMode.RGB_ALPHA, "RGBA"), + (ImageReadMode.UNCHANGED, None), + ), +) +@pytest.mark.parametrize( + "filename", Path("/home/nicolashug/dev/libavif/tests/data/").glob("*.avif"), ids=lambda p: p.name +) +def test_decode_avif_heic_against_pil(decode_fun, mode, pil_mode, filename): + if "reversed_dimg_order" in str(filename): + # Pillow properly decodes this one, but we don't (order of parts of the + # image is wrong). This is due to a bug that was recently fixed in + # libavif. 
Hopefully this test will end up passing soon with a new + # libavif version https://github.com/AOMediaCodec/libavif/issues/2311 + pytest.xfail() + import pillow_avif # noqa + + encoded_bytes = read_file(filename) + try: + img = decode_fun(encoded_bytes, mode=mode) + except RuntimeError as e: + if any( + s in str(e) + for s in ( + "BMFF parsing failed", + "avifDecoderParse failed: ", + "file contains more than one image", + "no 'ispe' property", + "'iref' has double references", + "Invalid image grid", + "decode_heif failed: Invalid input: No 'meta' box", + ) + ): + pytest.skip(reason="Expected failure, that's OK") + else: + raise e + assert img[None].is_contiguous(memory_format=torch.channels_last) + if mode == ImageReadMode.RGB: + assert img.shape[0] == 3 + if mode == ImageReadMode.RGB_ALPHA: + assert img.shape[0] == 4 + + if img.dtype == torch.uint16: + img = F.to_dtype(img, dtype=torch.uint8, scale=True) + try: + from_pil = F.pil_to_tensor(Image.open(filename).convert(pil_mode)) + except RuntimeError as e: + if any(s in str(e) for s in ("Invalid image grid", "Failed to decode image: Not implemented")): + pytest.skip(reason="PIL failure") + else: + raise e + + if True: + from torchvision.utils import make_grid + + g = make_grid([img, from_pil]) + F.to_pil_image(g).save(f"/home/nicolashug/out_images/{filename.name}.{pil_mode}.png") + + is_decode_heic = getattr(decode_fun, "__name__", getattr(decode_fun, "name", None)) == "decode_heic" + if mode == ImageReadMode.RGB and not is_decode_heic: + # We don't compare torchvision's AVIF against PIL for RGB because + # results look pretty different on RGBA images (other images are fine). + # The result on torchvision basically just plainly ignores the alpha + # channel, resulting in transparent pixels looking dark. PIL seems to be + # using a sort of k-nn thing (Take a look at the resulting images) + return + if filename.name == "sofa_grid1x5_420.avif" and is_decode_heic: + return + + torch.testing.assert_close(img, from_pil, rtol=0, atol=3) + + +# @pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE) +@pytest.mark.skipif(True, reason="Skipping avif/heic tests for now.") +@pytest.mark.parametrize("decode_fun", (decode_heic,)) +def test_decode_heic(decode_fun): + encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".heic"))) + img = decode_fun(encoded_bytes) + assert img.shape == (3, 100, 100) + assert img[None].is_contiguous(memory_format=torch.channels_last) + img += 123 # make sure image buffer wasn't freed by underlying decoding lib + + +@pytest.mark.parametrize("input_type", ("Path", "str", "tensor")) +@pytest.mark.parametrize("scripted", (False, True)) +def test_decode_image_path(input_type, scripted): + # Check that decode_image can support not just tensors as input + path = next(get_images(IMAGE_ROOT, ".jpg")) + if input_type == "Path": + input = Path(path) + elif input_type == "str": + input = path + elif input_type == "tensor": + input = read_file(path) + else: + raise ValueError("Oops") + + if scripted and input_type == "Path": + pytest.xfail(reason="Can't pass a Path when scripting") + + decode_fun = torch.jit.script(decode_image) if scripted else decode_image + decode_fun(input) + + +def test_mode_str(): + # Make sure decode_image supports string modes. We just test decode_image, + # not all of the decoding functions, but they should all support that too. + # Torchscript fails when passing strings, which is expected.
+ path = next(get_images(IMAGE_ROOT, ".png")) + assert decode_image(path, mode="RGB").shape[0] == 3 + assert decode_image(path, mode="rGb").shape[0] == 3 + assert decode_image(path, mode="GRAY").shape[0] == 1 + assert decode_image(path, mode="RGBA").shape[0] == 4 + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_internet.py b/test/test_internet.py index 03828c151c0..34fc3d4aa08 100644 --- a/test/test_internet.py +++ b/test/test_internet.py @@ -6,6 +6,7 @@ """ import os +import pathlib from urllib.error import URLError import pytest @@ -13,7 +14,10 @@ class TestDatasetUtils: - def test_download_url(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_download_url(self, tmpdir, use_pathlib): + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) url = "http://github.com/pytorch/vision/archive/master.zip" try: utils.download_url(url, tmpdir) @@ -21,7 +25,10 @@ def test_download_url(self, tmpdir): except URLError: pytest.skip(f"could not download test file '{url}'") - def test_download_url_retry_http(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_download_url_retry_http(self, tmpdir, use_pathlib): + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) url = "https://github.com/pytorch/vision/archive/master.zip" try: utils.download_url(url, tmpdir) @@ -29,12 +36,18 @@ def test_download_url_retry_http(self, tmpdir): except URLError: pytest.skip(f"could not download test file '{url}'") - def test_download_url_dont_exist(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_download_url_dont_exist(self, tmpdir, use_pathlib): + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) url = "http://github.com/pytorch/vision/archive/this_doesnt_exist.zip" with pytest.raises(URLError): utils.download_url(url, tmpdir) - def test_download_url_dispatch_download_from_google_drive(self, mocker, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_download_url_dispatch_download_from_google_drive(self, mocker, tmpdir, use_pathlib): + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) url = "https://drive.google.com/file/d/1GO-BHUYRuvzr1Gtp2_fqXRsr9TIeYbhV/view" id = "1GO-BHUYRuvzr1Gtp2_fqXRsr9TIeYbhV" @@ -44,7 +57,7 @@ def test_download_url_dispatch_download_from_google_drive(self, mocker, tmpdir): mocked = mocker.patch("torchvision.datasets.utils.download_file_from_google_drive") utils.download_url(url, tmpdir, filename, md5) - mocked.assert_called_once_with(id, tmpdir, filename, md5) + mocked.assert_called_once_with(id, os.path.expanduser(tmpdir), filename, md5) if __name__ == "__main__": diff --git a/test/test_io.py b/test/test_io.py index c45180571f0..d2950ac9595 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -6,7 +6,7 @@ import pytest import torch import torchvision.io as io -from common_utils import assert_equal +from common_utils import assert_equal, cpu_and_cuda from torchvision import get_video_backend @@ -63,7 +63,7 @@ def temp_video(num_frames, height, width, fps, lossless=False, video_codec=None, @pytest.mark.skipif( - get_video_backend() != "pyav" and not io._HAS_VIDEO_OPT, reason="video_reader backend not available" + get_video_backend() != "pyav" and not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend not available" ) @pytest.mark.skipif(av is None, reason="PyAV unavailable") class TestVideo: @@ -77,14 +77,14 @@ def test_write_read_video(self): assert_equal(data, lv) assert info["video_fps"] == 5 - @pytest.mark.skipif(not io._HAS_VIDEO_OPT, 
reason="video_reader backend is not chosen") + @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") def test_probe_video_from_file(self): with temp_video(10, 300, 300, 5) as (f_name, data): video_info = io._probe_video_from_file(f_name) assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps - @pytest.mark.skipif(not io._HAS_VIDEO_OPT, reason="video_reader backend is not chosen") + @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") def test_probe_video_from_memory(self): with temp_video(10, 300, 300, 5) as (f_name, data): with open(f_name, "rb") as fp: @@ -255,18 +255,19 @@ def test_read_video_partially_corrupted_file(self): assert_equal(video, data) @pytest.mark.skipif(sys.platform == "win32", reason="temporarily disabled on Windows") - def test_write_video_with_audio(self, tmpdir): + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_write_video_with_audio(self, device, tmpdir): f_name = os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4") video_tensor, audio_tensor, info = io.read_video(f_name, pts_unit="sec") out_f_name = os.path.join(tmpdir, "testing.mp4") io.video.write_video( out_f_name, - video_tensor, + video_tensor.to(device), round(info["video_fps"]), video_codec="libx264rgb", options={"crf": "0"}, - audio_array=audio_tensor, + audio_array=audio_tensor.to(device), audio_fps=info["audio_fps"], audio_codec="aac", ) diff --git a/test/test_models.cpp b/test/test_models.cpp deleted file mode 100644 index 092fc567ac2..00000000000 --- a/test/test_models.cpp +++ /dev/null @@ -1,209 +0,0 @@ -#include -#include -#include - -#include "../torchvision/csrc/models/models.h" - -using namespace vision::models; - -template -torch::Tensor forward_model(const std::string& input_path, torch::Tensor x) { - Model network; - torch::load(network, input_path); - network->eval(); - return network->forward(x); -} - -torch::Tensor forward_alexnet(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_vgg11(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg13(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg16(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg19(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_vgg11bn(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg13bn(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg16bn(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg19bn(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_resnet18(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnet34(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnet50(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnet101( - const std::string& input_path, - 
torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnet152( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnext50_32x4d( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnext101_32x8d( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_wide_resnet50_2( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_wide_resnet101_2( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_squeezenet1_0( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_squeezenet1_1( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_densenet121( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_densenet169( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_densenet201( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_densenet161( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_mobilenetv2( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_googlenet( - const std::string& input_path, - torch::Tensor x) { - GoogLeNet network; - torch::load(network, input_path); - network->eval(); - return network->forward(x).output; -} -torch::Tensor forward_inceptionv3( - const std::string& input_path, - torch::Tensor x) { - InceptionV3 network; - torch::load(network, input_path); - network->eval(); - return network->forward(x).output; -} - -torch::Tensor forward_mnasnet0_5(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_mnasnet0_75(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_mnasnet1_0(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_mnasnet1_3(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward_alexnet", &forward_alexnet, "forward_alexnet"); - - m.def("forward_vgg11", &forward_vgg11, "forward_vgg11"); - m.def("forward_vgg13", &forward_vgg13, "forward_vgg13"); - m.def("forward_vgg16", &forward_vgg16, "forward_vgg16"); - m.def("forward_vgg19", &forward_vgg19, "forward_vgg19"); - - m.def("forward_vgg11bn", &forward_vgg11bn, "forward_vgg11bn"); - m.def("forward_vgg13bn", &forward_vgg13bn, "forward_vgg13bn"); - m.def("forward_vgg16bn", &forward_vgg16bn, "forward_vgg16bn"); - m.def("forward_vgg19bn", &forward_vgg19bn, "forward_vgg19bn"); - - m.def("forward_resnet18", &forward_resnet18, "forward_resnet18"); - m.def("forward_resnet34", &forward_resnet34, "forward_resnet34"); - m.def("forward_resnet50", &forward_resnet50, "forward_resnet50"); - m.def("forward_resnet101", &forward_resnet101, "forward_resnet101"); - m.def("forward_resnet152", &forward_resnet152, "forward_resnet152"); - 
m.def( - "forward_resnext50_32x4d", - &forward_resnext50_32x4d, - "forward_resnext50_32x4d"); - m.def( - "forward_resnext101_32x8d", - &forward_resnext101_32x8d, - "forward_resnext101_32x8d"); - m.def( - "forward_wide_resnet50_2", - &forward_wide_resnet50_2, - "forward_wide_resnet50_2"); - m.def( - "forward_wide_resnet101_2", - &forward_wide_resnet101_2, - "forward_wide_resnet101_2"); - - m.def( - "forward_squeezenet1_0", &forward_squeezenet1_0, "forward_squeezenet1_0"); - m.def( - "forward_squeezenet1_1", &forward_squeezenet1_1, "forward_squeezenet1_1"); - - m.def("forward_densenet121", &forward_densenet121, "forward_densenet121"); - m.def("forward_densenet169", &forward_densenet169, "forward_densenet169"); - m.def("forward_densenet201", &forward_densenet201, "forward_densenet201"); - m.def("forward_densenet161", &forward_densenet161, "forward_densenet161"); - - m.def("forward_mobilenetv2", &forward_mobilenetv2, "forward_mobilenetv2"); - - m.def("forward_googlenet", &forward_googlenet, "forward_googlenet"); - m.def("forward_inceptionv3", &forward_inceptionv3, "forward_inceptionv3"); - - m.def("forward_mnasnet0_5", &forward_mnasnet0_5, "forward_mnasnet0_5"); - m.def("forward_mnasnet0_75", &forward_mnasnet0_75, "forward_mnasnet0_75"); - m.def("forward_mnasnet1_0", &forward_mnasnet1_0, "forward_mnasnet1_0"); - m.def("forward_mnasnet1_3", &forward_mnasnet1_3, "forward_mnasnet1_3"); -} diff --git a/test/test_models.py b/test/test_models.py index f145727bbd1..202bbdbd0cd 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -3,6 +3,7 @@ import operator import os import pkgutil +import platform import sys import warnings from collections import OrderedDict @@ -14,9 +15,10 @@ import torch.fx import torch.nn as nn from _utils_internal import get_relative_path -from common_utils import cpu_and_gpu, freeze_rng_state, map_nested_tensor_object, needs_cuda, set_rng_seed -from torchvision import models -from torchvision.models._api import find_model, list_models +from common_utils import cpu_and_cuda, freeze_rng_state, map_nested_tensor_object, needs_cuda, set_rng_seed +from PIL import Image +from torchvision import models, transforms +from torchvision.models import get_model_builder, list_models ACCEPT = os.getenv("EXPECTTEST_ACCEPT", "0") == "1" @@ -24,7 +26,44 @@ def list_model_fns(module): - return [find_model(name) for name in list_models(module)] + return [get_model_builder(name) for name in list_models(module)] + + +def _get_image(input_shape, real_image, device, dtype=None): + """This routine loads a real or random image based on `real_image` argument. + Currently, the real image is utilized for the following list of models: + - `retinanet_resnet50_fpn`, + - `retinanet_resnet50_fpn_v2`, + - `keypointrcnn_resnet50_fpn`, + - `fasterrcnn_resnet50_fpn`, + - `fasterrcnn_resnet50_fpn_v2`, + - `fcos_resnet50_fpn`, + - `maskrcnn_resnet50_fpn`, + - `maskrcnn_resnet50_fpn_v2`, + in `test_classification_model` and `test_detection_model`. 
+ To do so, a keyword argument `real_image` was added to the abovelisted models in `_model_params` + """ + if real_image: + # TODO: Maybe unify file discovery logic with test_image.py + GRACE_HOPPER = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" + ) + + img = Image.open(GRACE_HOPPER) + + original_width, original_height = img.size + + # make the image square + img = img.crop((0, 0, original_width, original_width)) + img = img.resize(input_shape[1:3]) + + convert_tensor = transforms.ToTensor() + image = convert_tensor(img) + assert tuple(image.size()) == input_shape + return image.to(device=device, dtype=dtype) + + # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests + return torch.rand(input_shape).to(device=device, dtype=dtype) @pytest.fixture @@ -110,10 +149,10 @@ def _assert_expected(output, name, prec=None, atol=None, rtol=None): if binary_size > MAX_PICKLE_SIZE: raise RuntimeError(f"The output for {filename}, is larger than 50kb - got {binary_size}kb") else: - expected = torch.load(expected_file) + expected = torch.load(expected_file, weights_only=True) rtol = rtol or prec # keeping prec param for legacy reason, but could be removed ideally atol = atol or prec - torch.testing.assert_close(output, expected, rtol=rtol, atol=atol, check_dtype=False) + torch.testing.assert_close(output, expected, rtol=rtol, atol=atol, check_dtype=False, check_device=False) def _check_jit_scriptable(nn_module, args, unwrapper=None, eager_out=None): @@ -128,6 +167,7 @@ def get_export_import_copy(m): return imported sm = torch.jit.script(nn_module) + sm.eval() if eager_out is None: with torch.no_grad(), freeze_rng_state(): @@ -153,7 +193,8 @@ def _check_fx_compatible(model, inputs, eager_out=None): model_fx = torch.fx.symbolic_trace(model) if eager_out is None: eager_out = model(inputs) - fx_out = model_fx(inputs) + with torch.no_grad(), freeze_rng_state(): + fx_out = model_fx(inputs) torch.testing.assert_close(eager_out, fx_out) @@ -237,17 +278,23 @@ def _check_input_backprop(model, inputs): # tests under test_quantized_classification_model will be skipped for the following models. quantized_flaky_models = ("inception_v3", "resnet50") +# The tests for the following detection models are flaky. +# We run those tests on float64 to avoid floating point errors. +# FIXME: we shouldn't have to do that :'/ +detection_flaky_models = ("keypointrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn_v2") + # The following contains configuration parameters for all models which are used by # the _test_*_model methods. 
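
For orientation, each entry in the table below is layered over the per-test defaults and then split into test-only keys and builder keyword arguments; the consumption pattern inside test_classification_model / test_detection_model further down is roughly:

kwargs = {**defaults, **_model_params.get(model_name, {})}
input_shape = kwargs.pop("input_shape")       # shapes the input tensor, never passed to the builder
real_image = kwargs.pop("real_image", False)  # switches from random input to the Grace Hopper asset
model = model_fn(**kwargs)                    # everything that remains goes to the model builder
x = _get_image(input_shape=input_shape, real_image=real_image, device=dev)
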
_model_params = { - "inception_v3": {"input_shape": (1, 3, 299, 299)}, + "inception_v3": {"input_shape": (1, 3, 299, 299), "init_weights": True}, "retinanet_resnet50_fpn": { "num_classes": 20, "score_thresh": 0.01, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "retinanet_resnet50_fpn_v2": { "num_classes": 20, @@ -255,6 +302,7 @@ def _check_input_backprop(model, inputs): "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "keypointrcnn_resnet50_fpn": { "num_classes": 2, @@ -262,18 +310,21 @@ def _check_input_backprop(model, inputs): "max_size": 224, "box_score_thresh": 0.17, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_resnet50_fpn": { "num_classes": 20, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_resnet50_fpn_v2": { "num_classes": 20, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fcos_resnet50_fpn": { "num_classes": 2, @@ -281,18 +332,21 @@ def _check_input_backprop(model, inputs): "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "maskrcnn_resnet50_fpn": { "num_classes": 10, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "maskrcnn_resnet50_fpn_v2": { "num_classes": 10, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_mobilenet_v3_large_fpn": { "box_score_thresh": 0.02076, @@ -315,6 +369,7 @@ def _check_input_backprop(model, inputs): "s3d": { "input_shape": (1, 3, 16, 224, 224), }, + "googlenet": {"init_weights": True}, } # speeding up slow models: slow_models = [ @@ -343,12 +398,25 @@ def _check_input_backprop(model, inputs): _model_params[m] = {"input_shape": (1, 3, 64, 64)} -# skip big models to reduce memory usage on CI test +# skip big models to reduce memory usage on CI test. We can exclude combinations of (platform-system, device). 
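
To make the (platform-system, device) exclusion concrete, this is how the mapping below is queried by is_skippable(), using the entries that follow:

is_skippable("vit_h_14", "cuda:0")   # True on Windows, False on Linux: skipped on Windows only
is_skippable("mvit_v1_b", "cuda:0")  # True on Windows and Linux: CUDA runs are always skipped
is_skippable("mvit_v1_b", "cpu")     # False everywhere: CPU runs stay enabled
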
skipped_big_models = { - "vit_h_14", - "regnet_y_128gf", + "vit_h_14": {("Windows", "cpu"), ("Windows", "cuda")}, + "regnet_y_128gf": {("Windows", "cpu"), ("Windows", "cuda")}, + "mvit_v1_b": {("Windows", "cuda"), ("Linux", "cuda")}, + "mvit_v2_s": {("Windows", "cuda"), ("Linux", "cuda")}, } + +def is_skippable(model_name, device): + if model_name not in skipped_big_models: + return False + + platform_system = platform.system() + device_name = str(device).split(":")[0] + + return (platform_system, device_name) in skipped_big_models[model_name] + + # The following contains configuration and expected values to be used tests that are model specific _model_tests_values = { "retinanet_resnet50_fpn": { @@ -598,13 +666,14 @@ def vitc_b_16(**kwargs: Any): @pytest.mark.parametrize("model_fn", [vitc_b_16]) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_vitc_models(model_fn, dev): test_classification_model(model_fn, dev) +@torch.backends.cudnn.flags(allow_tf32=False) # see: https://github.com/pytorch/vision/issues/7618 @pytest.mark.parametrize("model_fn", list_model_fns(models)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_classification_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -612,18 +681,25 @@ def test_classification_model(model_fn, dev): "input_shape": (1, 3, 224, 224), } model_name = model_fn.__name__ - if SKIP_BIG_MODEL and model_name in skipped_big_models: + if SKIP_BIG_MODEL and is_skippable(model_name, dev): pytest.skip("Skipped to reduce memory usage. Set env var SKIP_BIG_MODEL=0 to enable test for this model") kwargs = {**defaults, **_model_params.get(model_name, {})} num_classes = kwargs.get("num_classes") input_shape = kwargs.pop("input_shape") + real_image = kwargs.pop("real_image", False) model = model_fn(**kwargs) model.eval().to(device=dev) - # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests - x = torch.rand(input_shape).to(device=dev) + x = _get_image(input_shape=input_shape, real_image=real_image, device=dev) out = model(x) - _assert_expected(out.cpu(), model_name, prec=1e-3) + # FIXME: this if/else is nasty and only here to please our CI prior to the + # release. We rethink these tests altogether. + if model_name == "resnet101": + prec = 0.2 + else: + # FIXME: this is probably still way too high. + prec = 0.1 + _assert_expected(out.cpu(), model_name, prec=prec) assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -640,7 +716,7 @@ def test_classification_model(model_fn, dev): @pytest.mark.parametrize("model_fn", list_model_fns(models.segmentation)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_segmentation_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -656,7 +732,8 @@ def test_segmentation_model(model_fn, dev): model.eval().to(device=dev) # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests x = torch.rand(input_shape).to(device=dev) - out = model(x) + with torch.no_grad(), freeze_rng_state(): + out = model(x) def check_out(out): prec = 0.01 @@ -670,8 +747,10 @@ def check_out(out): # so instead of validating the probability scores, check that the class # predictions match. 
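# (The partial check below compares only the per-pixel argmax of the logits;
# check_device=False lets CUDA outputs be compared against the stored expected files
# regardless of the device they were generated on.)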
expected_file = _get_expected_file(model_name) - expected = torch.load(expected_file) - torch.testing.assert_close(out.argmax(dim=1), expected.argmax(dim=1), rtol=prec, atol=prec) + expected = torch.load(expected_file, weights_only=True) + torch.testing.assert_close( + out.argmax(dim=1), expected.argmax(dim=1), rtol=prec, atol=prec, check_device=False + ) return False # Partial validation performed return True # Full validation performed @@ -682,7 +761,7 @@ def check_out(out): _check_fx_compatible(model, x, eager_out=out) if dev == "cuda": - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state(): out = model(x) # See autocast_flaky_numerics comment at top of file. if model_name not in autocast_flaky_numerics: @@ -702,7 +781,7 @@ def check_out(out): @pytest.mark.parametrize("model_fn", list_model_fns(models.detection)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_detection_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -711,15 +790,20 @@ def test_detection_model(model_fn, dev): "input_shape": (3, 300, 300), } model_name = model_fn.__name__ + if model_name in detection_flaky_models: + dtype = torch.float64 + else: + dtype = torch.get_default_dtype() kwargs = {**defaults, **_model_params.get(model_name, {})} input_shape = kwargs.pop("input_shape") + real_image = kwargs.pop("real_image", False) model = model_fn(**kwargs) - model.eval().to(device=dev) - # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests - x = torch.rand(input_shape).to(device=dev) + model.eval().to(device=dev, dtype=dtype) + x = _get_image(input_shape=input_shape, real_image=real_image, device=dev, dtype=dtype) model_input = [x] - out = model(model_input) + with torch.no_grad(), freeze_rng_state(): + out = model(model_input) assert model_input[0] is x def check_out(out): @@ -763,7 +847,7 @@ def compute_mean_std(tensor): # as in NMSTester.test_nms_cuda to see if this is caused by duplicate # scores. expected_file = _get_expected_file(model_name) - expected = torch.load(expected_file) + expected = torch.load(expected_file, weights_only=True) torch.testing.assert_close( output[0]["scores"], expected[0]["scores"], rtol=prec, atol=prec, check_device=False, check_dtype=False ) @@ -780,7 +864,7 @@ def compute_mean_std(tensor): _check_jit_scriptable(model, ([x],), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) if dev == "cuda": - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state(): out = model(model_input) # See autocast_flaky_numerics comment at top of file. if model_name not in autocast_flaky_numerics: @@ -829,7 +913,7 @@ def test_detection_model_validation(model_fn): @pytest.mark.parametrize("model_fn", list_model_fns(models.video)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_video_model(model_fn, dev): set_rng_seed(0) # the default input shape is @@ -839,7 +923,7 @@ def test_video_model(model_fn, dev): "num_classes": 50, } model_name = model_fn.__name__ - if SKIP_BIG_MODEL and model_name in skipped_big_models: + if SKIP_BIG_MODEL and is_skippable(model_name, dev): pytest.skip("Skipped to reduce memory usage. 
Set env var SKIP_BIG_MODEL=0 to enable test for this model") kwargs = {**defaults, **_model_params.get(model_name, {})} num_classes = kwargs.get("num_classes") @@ -850,7 +934,7 @@ def test_video_model(model_fn, dev): # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests x = torch.rand(input_shape).to(device=dev) out = model(x) - _assert_expected(out.cpu(), model_name, prec=1e-5) + _assert_expected(out.cpu(), model_name, prec=0.1) assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -893,7 +977,7 @@ def test_quantized_classification_model(model_fn): out = model(x) if model_name not in quantized_flaky_models: - _assert_expected(out, model_name + "_quantized", prec=2e-2) + _assert_expected(out.cpu(), model_name + "_quantized", prec=2e-2) assert out.shape[-1] == 5 _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -943,7 +1027,7 @@ def test_raft(model_fn, scripted): torch.manual_seed(0) # We need very small images, otherwise the pickle size would exceed the 50KB - # As a resut we need to override the correlation pyramid to not downsample + # As a result we need to override the correlation pyramid to not downsample # too much, otherwise we would get nan values (effective H and W would be # reduced to 1) corr_block = models.optical_flow.raft.CorrBlock(num_levels=2, radius=2) @@ -959,8 +1043,8 @@ def test_raft(model_fn, scripted): preds = model(img1, img2) flow_pred = preds[-1] # Tolerance is fairly high, but there are 2 * H * W outputs to check - # The .pkl were generated on the AWS cluter, on the CI it looks like the resuts are slightly different - _assert_expected(flow_pred, name=model_fn.__name__, atol=1e-2, rtol=1) + # The .pkl were generated on the AWS cluter, on the CI it looks like the results are slightly different + _assert_expected(flow_pred.cpu(), name=model_fn.__name__, atol=1e-2, rtol=1) if __name__ == "__main__": diff --git a/test/test_models_detection_utils.py b/test/test_models_detection_utils.py index 09895057a9a..69703ab5817 100644 --- a/test/test_models_detection_utils.py +++ b/test/test_models_detection_utils.py @@ -38,7 +38,7 @@ def test_box_linear_coder(self): def test_resnet_fpn_backbone_frozen_layers(self, train_layers, exp_froz_params): # we know how many initial layers and parameters of the network should # be frozen for each trainable_backbone_layers parameter value - # i.e all 53 params are frozen if trainable_backbone_layers=0 + # i.e. 
all 53 params are frozen if trainable_backbone_layers=0 # ad first 24 params are frozen if trainable_backbone_layers=2 model = backbone_utils.resnet_fpn_backbone("resnet50", weights=None, trainable_layers=train_layers) # boolean list that is true if the param at that index is frozen diff --git a/test/test_onnx.py b/test/test_onnx.py index d5dae64b4d0..c9d91454d7c 100644 --- a/test/test_onnx.py +++ b/test/test_onnx.py @@ -1,6 +1,6 @@ import io from collections import OrderedDict -from typing import List, Tuple +from typing import Optional import pytest import torch @@ -11,7 +11,7 @@ from torchvision.models.detection.roi_heads import RoIHeads from torchvision.models.detection.rpn import AnchorGenerator, RegionProposalNetwork, RPNHead from torchvision.models.detection.transform import GeneralizedRCNNTransform -from torchvision.ops._register_onnx_ops import _onnx_opset_version +from torchvision.ops import _register_onnx_ops # In environments without onnxruntime we prefer to # invoke all tests in the repo and have this one skipped rather than fail. @@ -27,12 +27,15 @@ def run_model( self, model, inputs_list, - tolerate_small_mismatch=False, do_constant_folding=True, dynamic_axes=None, output_names=None, input_names=None, + opset_version: Optional[int] = None, ): + if opset_version is None: + opset_version = _register_onnx_ops.BASE_ONNX_OPSET_VERSION + model.eval() onnx_io = io.BytesIO() @@ -46,10 +49,11 @@ def run_model( torch_onnx_input, onnx_io, do_constant_folding=do_constant_folding, - opset_version=_onnx_opset_version, + opset_version=opset_version, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, + verbose=True, ) # validate the exported model with onnx runtime for test_inputs in inputs_list: @@ -59,9 +63,9 @@ def run_model( test_ouputs = model(*test_inputs) if isinstance(test_ouputs, torch.Tensor): test_ouputs = (test_ouputs,) - self.ort_validate(onnx_io, test_inputs, test_ouputs, tolerate_small_mismatch) + self.ort_validate(onnx_io, test_inputs, test_ouputs) - def ort_validate(self, onnx_io, inputs, outputs, tolerate_small_mismatch=False): + def ort_validate(self, onnx_io, inputs, outputs): inputs, _ = torch.jit._flatten(inputs) outputs, _ = torch.jit._flatten(outputs) @@ -75,19 +79,13 @@ def to_numpy(tensor): inputs = list(map(to_numpy, inputs)) outputs = list(map(to_numpy, outputs)) - ort_session = onnxruntime.InferenceSession(onnx_io.getvalue()) + ort_session = onnxruntime.InferenceSession(onnx_io.getvalue(), providers=onnxruntime.get_available_providers()) # compute onnxruntime output prediction ort_inputs = {ort_session.get_inputs()[i].name: inpt for i, inpt in enumerate(inputs)} ort_outs = ort_session.run(None, ort_inputs) for i in range(0, len(outputs)): - try: - torch.testing.assert_allclose(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) - except AssertionError as error: - if tolerate_small_mismatch: - assert "(0.00%)" in str(error), str(error) - else: - raise + torch.testing.assert_close(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) def test_nms(self): num_boxes = 100 @@ -140,39 +138,39 @@ def test_roi_align(self): model = ops.RoIAlign((5, 5), 1, -1) self.run_model(model, [(x, single_roi)]) - @pytest.mark.skip(reason="ROIAlign with aligned=True is not supported in ONNX, but will be supported in opset 16.") def test_roi_align_aligned(self): + supported_onnx_version = _register_onnx_ops._ONNX_OPSET_VERSION_16 x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 1.5, 1.5, 3, 3]], dtype=torch.float32) model = 
ops.RoIAlign((5, 5), 1, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 0.5, 3, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1.8, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, 0, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, -1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) - @pytest.mark.skip(reason="Issue in exporting ROIAlign with aligned = True for malformed boxes") def test_roi_align_malformed_boxes(self): + supported_onnx_version = _register_onnx_ops._ONNX_OPSET_VERSION_16 x = torch.randn(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 2, 0.3, 1.5, 1.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) def test_roi_pool(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) @@ -320,7 +318,6 @@ def forward(self_module, images, features): self.run_model( model, [(images, features), (images2, test_features)], - tolerate_small_mismatch=True, input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={ "input1": [0, 1, 2, 3], @@ -396,7 +393,6 @@ def forward(self_module, images, features): self.run_model( model, [(images, features), (images2, test_features)], - tolerate_small_mismatch=True, input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={ "input1": [0, 1, 2, 3], @@ -408,20 +404,19 @@ def forward(self_module, images, features): }, ) - def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: + def get_image(self, rel_path: str, size: tuple[int, int]) -> torch.Tensor: import os - import torchvision.transforms._pil_constants as _pil_constants from PIL import Image from torchvision.transforms import functional as F data_dir = os.path.join(os.path.dirname(__file__), "assets") path = os.path.join(data_dir, *rel_path.split("/")) - image = Image.open(path).convert("RGB").resize(size, _pil_constants.BILINEAR) + image = Image.open(path).convert("RGB").resize(size, Image.BILINEAR) return F.convert_image_dtype(F.pil_to_tensor(image)) - def get_test_images(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + def get_test_images(self) -> tuple[list[torch.Tensor], list[torch.Tensor]]: return ( [self.get_image("encode_jpeg/grace_hopper_517x606.jpg", (100, 320))], [self.get_image("fakedata/logos/rgb_pytorch.png", (250, 380))], @@ -442,7 +437,6 @@ def test_faster_rcnn(self): 
input_names=["images_tensors"], output_names=["outputs"], dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, - tolerate_small_mismatch=True, ) # Test exported model for an image with no detections on other images self.run_model( @@ -451,7 +445,6 @@ def test_faster_rcnn(self): input_names=["images_tensors"], output_names=["outputs"], dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, - tolerate_small_mismatch=True, ) # Verify that paste_mask_in_image beahves the same in tracing. @@ -506,7 +499,6 @@ def test_mask_rcnn(self): "scores": [0], "masks": [0, 1, 2], }, - tolerate_small_mismatch=True, ) # Test exported model for an image with no detections on other images self.run_model( @@ -521,7 +513,6 @@ def test_mask_rcnn(self): "scores": [0], "masks": [0, 1, 2], }, - tolerate_small_mismatch=True, ) # Verify that heatmaps_to_keypoints behaves the same in tracing. @@ -563,7 +554,6 @@ def test_keypoint_rcnn(self): input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, - tolerate_small_mismatch=True, ) self.run_model( @@ -572,7 +562,6 @@ def test_keypoint_rcnn(self): input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, - tolerate_small_mismatch=True, ) def test_shufflenet_v2_dynamic_axes(self): @@ -586,7 +575,6 @@ def test_shufflenet_v2_dynamic_axes(self): input_names=["input_images"], output_names=["output"], dynamic_axes={"input_images": {0: "batch_size"}, "output": {0: "batch_size"}}, - tolerate_small_mismatch=True, ) diff --git a/test/test_ops.py b/test/test_ops.py index a1cb5aa33d6..d2cf8d29181 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1,24 +1,50 @@ import math import os from abc import ABC, abstractmethod -from functools import lru_cache +from functools import lru_cache, partial from itertools import product -from typing import Callable, List, Tuple +from typing import Callable import numpy as np import pytest import torch import torch.fx import torch.nn.functional as F -from common_utils import assert_equal, cpu_and_gpu, needs_cuda +import torch.testing._internal.optests as optests +from common_utils import assert_equal, cpu_and_cuda, cpu_and_cuda_and_mps, needs_cuda, needs_mps from PIL import Image from torch import nn, Tensor +from torch._dynamo.utils import is_compile_supported from torch.autograd import gradcheck from torch.nn.modules.utils import _pair from torchvision import models, ops from torchvision.models.feature_extraction import get_graph_node_names +OPTESTS = [ + "test_schema", + "test_autograd_registration", + "test_faketensor", + "test_aot_dispatch_dynamic", +] + + +# Context manager for setting deterministic flag and automatically +# resetting it to its original value +class DeterministicGuard: + def __init__(self, deterministic, *, warn_only=False): + self.deterministic = deterministic + self.warn_only = warn_only + + def __enter__(self): + self.deterministic_restore = torch.are_deterministic_algorithms_enabled() + self.warn_only_restore = torch.is_deterministic_algorithms_warn_only_enabled() + torch.use_deterministic_algorithms(self.deterministic, warn_only=self.warn_only) + + def __exit__(self, exception_type, exception_value, traceback): + torch.use_deterministic_algorithms(self.deterministic_restore, warn_only=self.warn_only_restore) + + class RoIOpTesterModuleWrapper(nn.Module): def __init__(self, obj): super().__init__() @@ -74,20 +100,43 @@ def 
__init__(self, pool: nn.Module): super().__init__() self.pool = pool - def forward(self, imgs: Tensor, boxes: List[Tensor]) -> Tensor: + def forward(self, imgs: Tensor, boxes: list[Tensor]) -> Tensor: return self.pool(imgs, boxes) class RoIOpTester(ABC): dtype = torch.float64 + mps_dtype = torch.float32 + mps_backward_atol = 2e-2 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) - def test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwargs): - x_dtype = self.dtype if x_dtype is None else x_dtype - rois_dtype = self.dtype if rois_dtype is None else rois_dtype + @pytest.mark.parametrize( + "x_dtype", + ( + torch.float16, + torch.float32, + torch.float64, + ), + ids=str, + ) + def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, deterministic=False, **kwargs): + if device == "mps" and x_dtype is torch.float64: + pytest.skip("MPS does not support float64") + + rois_dtype = x_dtype if rois_dtype is None else rois_dtype + + tol = 1e-5 + if x_dtype is torch.half: + if device == "mps": + tol = 5e-3 + else: + tol = 4e-3 + elif x_dtype == torch.bfloat16: + tol = 5e-3 + pool_size = 5 - # n_channels % (pool_size ** 2) == 0 required for PS opeartions. + # n_channels % (pool_size ** 2) == 0 required for PS operations. n_channels = 2 * (pool_size**2) x = torch.rand(2, n_channels, 10, 10, dtype=x_dtype, device=device) if not contiguous: @@ -99,17 +148,17 @@ def test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwar ) pool_h, pool_w = pool_size, pool_size - y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs) + with DeterministicGuard(deterministic): + y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs) # the following should be true whether we're running an autocast test or not. 
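# (i.e. even under torch.cuda.amp.autocast the op is expected to return the input dtype;
# the relaxed tol of 4e-3 / 5e-3 chosen above accounts for the float16 / bfloat16 cases.)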
assert y.dtype == x.dtype gt_y = self.expected_fn( - x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=self.dtype, **kwargs + x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=x_dtype, **kwargs ) - tol = 1e-3 if (x_dtype is torch.half or rois_dtype is torch.half) else 1e-5 torch.testing.assert_close(gt_y.to(y), y, rtol=tol, atol=tol) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -118,17 +167,39 @@ def test_is_leaf_node(self, device): assert len(graph_node_names[0]) == len(graph_node_names[1]) assert len(graph_node_names[0]) == 1 + op_obj.n_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_torch_fx_trace(self, device, x_dtype=torch.float, rois_dtype=torch.float): + op_obj = self.make_obj().to(device=device) + graph_module = torch.fx.symbolic_trace(op_obj) + pool_size = 5 + n_channels = 2 * (pool_size**2) + x = torch.rand(2, n_channels, 5, 5, dtype=x_dtype, device=device) + rois = torch.tensor( + [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9], [1, 0, 0, 9, 9]], # format is (xyxy) + dtype=rois_dtype, + device=device, + ) + output_gt = op_obj(x, rois) + assert output_gt.dtype == x.dtype + output_fx = graph_module(x, rois) + assert output_fx.dtype == x.dtype + tol = 1e-5 + torch.testing.assert_close(output_gt, output_fx, rtol=tol, atol=tol) + @pytest.mark.parametrize("seed", range(10)) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) - def test_backward(self, seed, device, contiguous): + def test_backward(self, seed, device, contiguous, deterministic=False): + atol = self.mps_backward_atol if device == "mps" else 1e-05 + dtype = self.mps_dtype if device == "mps" else self.dtype + torch.random.manual_seed(seed) pool_size = 2 - x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=self.dtype, device=device, requires_grad=True) + x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=dtype, device=device, requires_grad=True) if not contiguous: x = x.permute(0, 1, 3, 2) rois = torch.tensor( - [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=self.dtype, device=device # format is (xyxy) + [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=dtype, device=device # format is (xyxy) ) def func(z): @@ -136,8 +207,26 @@ def func(z): script_func = self.get_script_fn(rois, pool_size) - gradcheck(func, (x,)) - gradcheck(script_func, (x,)) + with DeterministicGuard(deterministic): + gradcheck(func, (x,), atol=atol) + + gradcheck(script_func, (x,), atol=atol) + + @needs_mps + def test_mps_error_inputs(self): + pool_size = 2 + x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=torch.float16, device="mps", requires_grad=True) + rois = torch.tensor( + [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=torch.float16, device="mps" # format is (xyxy) + ) + + def func(z): + return self.fn(z, rois, pool_size, pool_size, spatial_scale=1, sampling_ratio=1) + + with pytest.raises( + RuntimeError, match="MPS does not support (?:ps_)?roi_(?:align|pool)? backward with float16 inputs." 
+ ): + gradcheck(func, (x,)) @needs_cuda @pytest.mark.parametrize("x_dtype", (torch.float, torch.half)) @@ -153,7 +242,7 @@ def _helper_boxes_shape(self, func): boxes = torch.tensor([[0, 0, 3, 3]], dtype=a.dtype) func(a, boxes, output_size=(2, 2)) - # test boxes as List[Tensor[N, 4]] + # test boxes as list[Tensor[N, 4]] with pytest.raises(AssertionError): a = torch.linspace(1, 8 * 8, 8 * 8).reshape(1, 1, 8, 8) boxes = torch.tensor([[0, 0, 3]], dtype=a.dtype) @@ -233,6 +322,8 @@ def test_jit_boxes_list(self): class TestPSRoIPool(RoIOpTester): + mps_backward_atol = 5e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return ops.PSRoIPool((pool_h, pool_w), 1)(x, rois) @@ -314,6 +405,8 @@ def bilinear_interpolate(data, y, x, snap_border=False): class TestRoIAlign(RoIOpTester): + mps_backward_atol = 6e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, aligned=False, **kwargs): return ops.RoIAlign( (pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio, aligned=aligned @@ -365,7 +458,6 @@ def expected_fn( grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w)) for channel in range(0, n_channels): - val = 0 for iy in range(0, grid_h): y = start_h + (iy + 0.5) * bin_h / grid_h @@ -381,23 +473,70 @@ def test_boxes_shape(self): self._helper_boxes_shape(ops.roi_align) @pytest.mark.parametrize("aligned", (True, False)) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("x_dtype", (torch.float16, torch.float32, torch.float64)) # , ids=str) @pytest.mark.parametrize("contiguous", (True, False)) - def test_forward(self, device, contiguous, aligned, x_dtype=None, rois_dtype=None): + @pytest.mark.parametrize("deterministic", (True, False)) + @pytest.mark.opcheck_only_one() + def test_forward(self, device, contiguous, deterministic, aligned, x_dtype, rois_dtype=None): + if deterministic and device == "cpu": + pytest.skip("cpu is always deterministic, don't retest") super().test_forward( - device=device, contiguous=contiguous, x_dtype=x_dtype, rois_dtype=rois_dtype, aligned=aligned + device=device, + contiguous=contiguous, + deterministic=deterministic, + x_dtype=x_dtype, + rois_dtype=rois_dtype, + aligned=aligned, ) @needs_cuda @pytest.mark.parametrize("aligned", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) @pytest.mark.parametrize("x_dtype", (torch.float, torch.half)) @pytest.mark.parametrize("rois_dtype", (torch.float, torch.half)) - def test_autocast(self, aligned, x_dtype, rois_dtype): + @pytest.mark.opcheck_only_one() + def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype): with torch.cuda.amp.autocast(): self.test_forward( - torch.device("cuda"), contiguous=False, aligned=aligned, x_dtype=x_dtype, rois_dtype=rois_dtype + torch.device("cuda"), + contiguous=False, + deterministic=deterministic, + aligned=aligned, + x_dtype=x_dtype, + rois_dtype=rois_dtype, ) + @pytest.mark.skip(reason="1/5000 flaky failure") + @pytest.mark.parametrize("aligned", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) + @pytest.mark.parametrize("x_dtype", (torch.float, torch.bfloat16)) + @pytest.mark.parametrize("rois_dtype", (torch.float, torch.bfloat16)) + def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype): + with torch.cpu.amp.autocast(): + self.test_forward( + torch.device("cpu"), + contiguous=False, + deterministic=deterministic, + 
aligned=aligned, + x_dtype=x_dtype, + rois_dtype=rois_dtype, + ) + + @pytest.mark.parametrize("seed", range(10)) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("contiguous", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) + @pytest.mark.opcheck_only_one() + def test_backward(self, seed, device, contiguous, deterministic): + if deterministic and device == "cpu": + pytest.skip("cpu is always deterministic, don't retest") + if deterministic and device == "mps": + pytest.skip("no deterministic implementation for mps") + if deterministic and not is_compile_supported(device): + pytest.skip("deterministic implementation only if torch.compile supported") + super().test_backward(seed, device, contiguous, deterministic) + def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000): rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype) rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,)) # set batch index @@ -407,6 +546,7 @@ def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000): @pytest.mark.parametrize("aligned", (True, False)) @pytest.mark.parametrize("scale, zero_point", ((1, 0), (2, 10), (0.1, 50))) @pytest.mark.parametrize("qdtype", (torch.qint8, torch.quint8, torch.qint32)) + @pytest.mark.opcheck_only_one() def test_qroialign(self, aligned, scale, zero_point, qdtype): """Make sure quantized version of RoIAlign is close to float version""" pool_size = 5 @@ -477,6 +617,8 @@ def test_jit_boxes_list(self): class TestPSRoIAlign(RoIOpTester): + mps_backward_atol = 5e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return ops.PSRoIAlign((pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)(x, rois) @@ -531,6 +673,43 @@ def test_boxes_shape(self): self._helper_boxes_shape(ops.ps_roi_align) +@pytest.mark.parametrize( + "op", + ( + torch.ops.torchvision.roi_pool, + torch.ops.torchvision.ps_roi_pool, + torch.ops.torchvision.roi_align, + torch.ops.torchvision.ps_roi_align, + ), +) +@pytest.mark.parametrize("dtype", (torch.float16, torch.float32, torch.float64)) +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("requires_grad", (True, False)) +def test_roi_opcheck(op, dtype, device, requires_grad): + # This manually calls opcheck() on the roi ops. We do that instead of + # relying on opcheck.generate_opcheck_tests() as e.g. done for nms, because + # pytest and generate_opcheck_tests() don't interact very well when it comes + # to skipping tests - and these ops need to skip the MPS tests since MPS we + # don't support dynamic shapes yet for MPS. 
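# (optests.opcheck runs essentially the same utilities listed in OPTESTS at the top of
# this file - schema, autograd registration, faketensor, AOT dispatch - against a single
# concrete call, so the device parametrization above can simply leave out "mps".)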
+ rois = torch.tensor( + [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9], [1, 0, 0, 9, 9]], + dtype=dtype, + device=device, + requires_grad=requires_grad, + ) + pool_size = 5 + num_channels = 2 * (pool_size**2) + x = torch.rand(2, num_channels, 10, 10, dtype=dtype, device=device) + + kwargs = dict(rois=rois, spatial_scale=1, pooled_height=pool_size, pooled_width=pool_size) + if op in (torch.ops.torchvision.roi_align, torch.ops.torchvision.ps_roi_align): + kwargs["sampling_ratio"] = -1 + if op is torch.ops.torchvision.roi_align: + kwargs["aligned"] = True + + optests.opcheck(op, args=(x,), kwargs=kwargs) + + class TestMultiScaleRoIAlign: def make_obj(self, fmap_names=None, output_size=(7, 7), sampling_ratio=2, wrap=False): if fmap_names is None: @@ -552,7 +731,7 @@ def test_msroialign_repr(self): ) assert repr(t) == expected_string - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -566,8 +745,9 @@ class TestNMS: def _reference_nms(self, boxes, scores, iou_threshold): """ Args: - box_scores (N, 5): boxes in corner-form and probabilities. - iou_threshold: intersection over union threshold. + boxes: boxes in corner-form + scores: probabilities + iou_threshold: intersection over union threshold Returns: picked: a list of indexes of the kept boxes """ @@ -605,13 +785,14 @@ def _create_tensors_with_iou(self, N, iou_thresh): @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("seed", range(10)) + @pytest.mark.opcheck_only_one() def test_nms_ref(self, iou, seed): torch.random.manual_seed(seed) err_msg = "NMS incompatible between CPU and reference implementation for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) keep_ref = self._reference_nms(boxes, scores, iou) keep = ops.nms(boxes, scores, iou) - assert torch.allclose(keep, keep_ref), err_msg.format(iou) + torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou)) def test_nms_input_errors(self): with pytest.raises(RuntimeError): @@ -625,13 +806,14 @@ def test_nms_input_errors(self): @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("scale, zero_point", ((1, 0), (2, 50), (3, 10))) + @pytest.mark.opcheck_only_one() def test_qnms(self, iou, scale, zero_point): # Note: we compare qnms vs nms instead of qnms vs reference implementation. 
- # This is because with the int convertion, the trick used in _create_tensors_with_iou + # This is because with the int conversion, the trick used in _create_tensors_with_iou # doesn't really work (in fact, nms vs reference implem will also fail with ints) err_msg = "NMS and QNMS give different results for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) - scores *= 100 # otherwise most scores would be 0 or 1 after int convertion + scores *= 100 # otherwise most scores would be 0 or 1 after int conversion qboxes = torch.quantize_per_tensor(boxes, scale=scale, zero_point=zero_point, dtype=torch.quint8) qscores = torch.quantize_per_tensor(scores, scale=scale, zero_point=zero_point, dtype=torch.quint8) @@ -642,42 +824,67 @@ def test_qnms(self, iou, scale, zero_point): keep = ops.nms(boxes, scores, iou) qkeep = ops.nms(qboxes, qscores, iou) - assert torch.allclose(qkeep, keep), err_msg.format(iou) + torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou)) - @needs_cuda + @pytest.mark.parametrize( + "device", + ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param("mps", marks=pytest.mark.needs_mps), + ), + ) @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) - def test_nms_cuda(self, iou, dtype=torch.float64): + @pytest.mark.opcheck_only_one() + def test_nms_gpu(self, iou, device, dtype=torch.float64): + dtype = torch.float32 if device == "mps" else dtype tol = 1e-3 if dtype is torch.half else 1e-5 err_msg = "NMS incompatible between CPU and CUDA for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) r_cpu = ops.nms(boxes, scores, iou) - r_cuda = ops.nms(boxes.cuda(), scores.cuda(), iou) + r_gpu = ops.nms(boxes.to(device), scores.to(device), iou) - is_eq = torch.allclose(r_cpu, r_cuda.cpu()) + is_eq = torch.allclose(r_cpu, r_gpu.cpu()) if not is_eq: # if the indices are not the same, ensure that it's because the scores # are duplicate - is_eq = torch.allclose(scores[r_cpu], scores[r_cuda.cpu()], rtol=tol, atol=tol) + is_eq = torch.allclose(scores[r_cpu], scores[r_gpu.cpu()], rtol=tol, atol=tol) assert is_eq, err_msg.format(iou) @needs_cuda @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("dtype", (torch.float, torch.half)) + @pytest.mark.opcheck_only_one() def test_autocast(self, iou, dtype): with torch.cuda.amp.autocast(): - self.test_nms_cuda(iou=iou, dtype=dtype) + self.test_nms_gpu(iou=iou, dtype=dtype, device="cuda") - @needs_cuda - def test_nms_cuda_float16(self): + @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) + @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16)) + def test_autocast_cpu(self, iou, dtype): + boxes, scores = self._create_tensors_with_iou(1000, iou) + with torch.cpu.amp.autocast(): + keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) + keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) + torch.testing.assert_close(keep_ref_float, keep_dtype) + + @pytest.mark.parametrize( + "device", + ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param("mps", marks=pytest.mark.needs_mps), + ), + ) + @pytest.mark.opcheck_only_one() + def test_nms_float16(self, device): boxes = torch.tensor( [ [285.3538, 185.5758, 1193.5110, 851.4551], [285.1472, 188.7374, 1192.4984, 851.0669], [279.2440, 197.9812, 1189.4746, 849.2019], ] - ).cuda() - scores = torch.tensor([0.6370, 0.7569, 0.3966]).cuda() + ).to(device) + scores = torch.tensor([0.6370, 0.7569, 0.3966]).to(device) iou_thres = 0.2 keep32 = ops.nms(boxes, scores, iou_thres) @@ -685,6 +892,7 @@ 
def test_nms_cuda_float16(self): assert_equal(keep32, keep16) @pytest.mark.parametrize("seed", range(10)) + @pytest.mark.opcheck_only_one() def test_batched_nms_implementations(self, seed): """Make sure that both implementations of batched_nms yield identical results""" torch.random.manual_seed(seed) @@ -710,8 +918,18 @@ def test_batched_nms_implementations(self, seed): torch.testing.assert_close(empty, ops.batched_nms(empty, None, None, None)) +optests.generate_opcheck_tests( + testcase=TestNMS, + namespaces=["torchvision"], + failures_dict_path=os.path.join(os.path.dirname(__file__), "optests_failures_dict.json"), + additional_decorators=[], + test_utils=OPTESTS, +) + + class TestDeformConv: dtype = torch.float64 + mps_dtype = torch.float32 def expected_fn(self, x, weight, offset, mask, bias, stride=1, padding=0, dilation=1): stride_h, stride_w = _pair(stride) @@ -824,7 +1042,7 @@ def make_obj(self, in_channels=6, out_channels=2, kernel_size=(3, 2), groups=2, ) return DeformConvModuleWrapper(obj) if wrap else obj - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -833,11 +1051,11 @@ def test_is_leaf_node(self, device): assert len(graph_node_names[0]) == len(graph_node_names[1]) assert len(graph_node_names[0]) == 1 + op_obj.n_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) @pytest.mark.parametrize("batch_sz", (0, 33)) def test_forward(self, device, contiguous, batch_sz, dtype=None): - dtype = dtype or self.dtype + dtype = self.mps_dtype if device == "mps" else dtype or self.dtype x, _, offset, mask, _, stride, padding, dilation = self.get_fn_args(device, contiguous, batch_sz, dtype) in_channels = 6 out_channels = 2 @@ -855,7 +1073,7 @@ def test_forward(self, device, contiguous, batch_sz, dtype=None): expected = self.expected_fn(x, weight, offset, mask, bias, stride=stride, padding=padding, dilation=dilation) torch.testing.assert_close( - res.to(expected), expected, rtol=tol, atol=tol, msg=f"\nres:\n{res}\nexpected:\n{expected}" + res.to(expected), expected, rtol=tol, atol=tol, msg=f"\nres: \n{res}\nexpected: \n{expected}" ) # no modulation test @@ -863,7 +1081,7 @@ def test_forward(self, device, contiguous, batch_sz, dtype=None): expected = self.expected_fn(x, weight, offset, None, bias, stride=stride, padding=padding, dilation=dilation) torch.testing.assert_close( - res.to(expected), expected, rtol=tol, atol=tol, msg=f"\nres:\n{res}\nexpected:\n{expected}" + res.to(expected), expected, rtol=tol, atol=tol, msg=f"\nres: \n{res}\nexpected: \n{expected}" ) def test_wrong_sizes(self): @@ -885,9 +1103,10 @@ def test_wrong_sizes(self): wrong_mask = torch.rand_like(mask[:, :2]) layer(x, offset, wrong_mask) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("contiguous", (True, False)) @pytest.mark.parametrize("batch_sz", (0, 33)) + @pytest.mark.opcheck_only_one() def test_backward(self, device, contiguous, batch_sz): x, weight, offset, mask, bias, stride, padding, dilation = self.get_fn_args( device, contiguous, batch_sz, self.dtype @@ -937,6 +1156,7 @@ def script_func_no_mask(x_, offset_, weight_, bias_, stride_, pad_, dilation_): @needs_cuda @pytest.mark.parametrize("contiguous", (True, False)) + 
@pytest.mark.opcheck_only_one() def test_compare_cpu_cuda_grads(self, contiguous): # Test from https://github.com/pytorch/vision/issues/2598 # Run on CUDA only @@ -958,7 +1178,6 @@ def test_compare_cpu_cuda_grads(self, contiguous): weight = init_weight for d in ["cpu", "cuda"]: - out = ops.deform_conv2d(img.to(d), offset.to(d), weight.to(d), padding=1, mask=mask.to(d)) out.mean().backward() if true_cpu_grads is None: @@ -972,6 +1191,7 @@ def test_compare_cpu_cuda_grads(self, contiguous): @needs_cuda @pytest.mark.parametrize("batch_sz", (0, 33)) @pytest.mark.parametrize("dtype", (torch.float, torch.half)) + @pytest.mark.opcheck_only_one() def test_autocast(self, batch_sz, dtype): with torch.cuda.amp.autocast(): self.test_forward(torch.device("cuda"), contiguous=False, batch_sz=batch_sz, dtype=dtype) @@ -981,6 +1201,34 @@ def test_forward_scriptability(self): torch.jit.script(ops.DeformConv2d(in_channels=8, out_channels=8, kernel_size=3)) +# NS: Remove me once backward is implemented for MPS +def xfail_if_mps(x): + mps_xfail_param = pytest.param("mps", marks=(pytest.mark.needs_mps, pytest.mark.xfail)) + new_pytestmark = [] + for mark in x.pytestmark: + if isinstance(mark, pytest.Mark) and mark.name == "parametrize": + if mark.args[0] == "device": + params = cpu_and_cuda() + (mps_xfail_param,) + new_pytestmark.append(pytest.mark.parametrize("device", params)) + continue + new_pytestmark.append(mark) + x.__dict__["pytestmark"] = new_pytestmark + return x + + +optests.generate_opcheck_tests( + testcase=TestDeformConv, + namespaces=["torchvision"], + failures_dict_path=os.path.join(os.path.dirname(__file__), "optests_failures_dict.json"), + # Skip tests due to unimplemented backward + additional_decorators={ + "test_aot_dispatch_dynamic__test_forward": [xfail_if_mps], + "test_autograd_registration__test_forward": [xfail_if_mps], + }, + test_utils=OPTESTS, +) + + class TestFrozenBNT: def test_frozenbatchnorm2d_repr(self): num_features = 32 @@ -1110,8 +1358,69 @@ def test_bbox_xywh_cxcywh(self): box_xywh = ops.box_convert(box_cxcywh, in_fmt="cxcywh", out_fmt="xywh") assert_equal(box_xywh, box_tensor) - @pytest.mark.parametrize("inv_infmt", ["xwyh", "cxwyh"]) - @pytest.mark.parametrize("inv_outfmt", ["xwcx", "xhwcy"]) + def test_bbox_xywhr_cxcywhr(self): + box_tensor = torch.tensor( + [ + [0, 0, 100, 100, 0], + [0, 0, 0, 0, 0], + [10, 15, 20, 20, 0], + [23, 35, 70, 60, 0], + [4, 2, 4, 2, 0], + [5, 5, 4, 2, 90], + [8, 4, 4, 2, 180], + [7, 1, 4, 2, -90], + ], + dtype=torch.float, + ) + + exp_cxcywhr = torch.tensor( + [ + [50, 50, 100, 100, 0], + [0, 0, 0, 0, 0], + [20, 25, 20, 20, 0], + [58, 65, 70, 60, 0], + [6, 3, 4, 2, 0], + [6, 3, 4, 2, 90], + [6, 3, 4, 2, 180], + [6, 3, 4, 2, -90], + ], + dtype=torch.float, + ) + + assert exp_cxcywhr.size() == torch.Size([8, 5]) + box_cxcywhr = ops.box_convert(box_tensor, in_fmt="xywhr", out_fmt="cxcywhr") + torch.testing.assert_close(box_cxcywhr, exp_cxcywhr) + + # Reverse conversion + box_xywhr = ops.box_convert(box_cxcywhr, in_fmt="cxcywhr", out_fmt="xywhr") + torch.testing.assert_close(box_xywhr, box_tensor) + + def test_bbox_cxcywhr_to_xyxyxyxy(self): + box_tensor = torch.tensor([[5, 3, 4, 2, 90]], dtype=torch.float) + exp_xyxyxyxy = torch.tensor([[4, 5, 4, 1, 6, 1, 6, 5]], dtype=torch.float) + + assert exp_xyxyxyxy.size() == torch.Size([1, 8]) + box_xyxyxyxy = ops.box_convert(box_tensor, in_fmt="cxcywhr", out_fmt="xyxyxyxy") + torch.testing.assert_close(box_xyxyxyxy, exp_xyxyxyxy) + + # Reverse conversion + box_cxcywhr = ops.box_convert(box_xyxyxyxy, 
in_fmt="xyxyxyxy", out_fmt="cxcywhr") + torch.testing.assert_close(box_cxcywhr, box_tensor) + + def test_bbox_xywhr_to_xyxyxyxy(self): + box_tensor = torch.tensor([[4, 5, 4, 2, 90]], dtype=torch.float) + exp_xyxyxyxy = torch.tensor([[4, 5, 4, 1, 6, 1, 6, 5]], dtype=torch.float) + + assert exp_xyxyxyxy.size() == torch.Size([1, 8]) + box_xyxyxyxy = ops.box_convert(box_tensor, in_fmt="xywhr", out_fmt="xyxyxyxy") + torch.testing.assert_close(box_xyxyxyxy, exp_xyxyxyxy) + + # Reverse conversion + box_xywhr = ops.box_convert(box_xyxyxyxy, in_fmt="xyxyxyxy", out_fmt="xywhr") + torch.testing.assert_close(box_xywhr, box_tensor) + + @pytest.mark.parametrize("inv_infmt", ["xwyh", "cxwyh", "xwyhr", "cxwyhr", "xxxxyyyy"]) + @pytest.mark.parametrize("inv_outfmt", ["xwcx", "xhwcy", "xwcxr", "xhwcyr", "xyxyxxyy"]) def test_bbox_invalid(self, inv_infmt, inv_outfmt): box_tensor = torch.tensor( [[0, 0, 100, 100], [0, 0, 0, 0], [10, 15, 20, 20], [23, 35, 70, 60]], dtype=torch.float @@ -1137,34 +1446,60 @@ def test_bbox_convert_jit(self): class TestBoxArea: - def area_check(self, box, expected, atol=1e-4): - out = ops.box_area(box) + def area_check(self, box, expected, fmt="xyxy", atol=1e-4): + out = ops.box_area(box, fmt=fmt) torch.testing.assert_close(out, expected, rtol=0.0, check_dtype=False, atol=atol) @pytest.mark.parametrize("dtype", [torch.int8, torch.int16, torch.int32, torch.int64]) - def test_int_boxes(self, dtype): - box_tensor = torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=dtype) + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_int_boxes(self, dtype, fmt): + box_tensor = ops.box_convert( + torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=dtype), in_fmt="xyxy", out_fmt=fmt + ) expected = torch.tensor([10000, 0], dtype=torch.int32) - self.area_check(box_tensor, expected) + self.area_check(box_tensor, expected, fmt) @pytest.mark.parametrize("dtype", [torch.float32, torch.float64]) - def test_float_boxes(self, dtype): - box_tensor = torch.tensor(FLOAT_BOXES, dtype=dtype) + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_float_boxes(self, dtype, fmt): + box_tensor = ops.box_convert(torch.tensor(FLOAT_BOXES, dtype=dtype), in_fmt="xyxy", out_fmt=fmt) expected = torch.tensor([604723.0806, 600965.4666, 592761.0085], dtype=dtype) - self.area_check(box_tensor, expected) - - def test_float16_box(self): - box_tensor = torch.tensor( - [[2.825, 1.8625, 3.90, 4.85], [2.825, 4.875, 19.20, 5.10], [2.925, 1.80, 8.90, 4.90]], dtype=torch.float16 + self.area_check(box_tensor, expected, fmt) + + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_float16_box(self, fmt): + box_tensor = ops.box_convert( + torch.tensor( + [[2.825, 1.8625, 3.90, 4.85], [2.825, 4.875, 19.20, 5.10], [2.925, 1.80, 8.90, 4.90]], + dtype=torch.float16, + ), + in_fmt="xyxy", + out_fmt=fmt, ) expected = torch.tensor([3.2170, 3.7108, 18.5071], dtype=torch.float16) - self.area_check(box_tensor, expected, atol=0.01) + self.area_check(box_tensor, expected, fmt, atol=0.01) + + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_box_area_jit(self, fmt): + box_tensor = ops.box_convert( + torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=torch.float), in_fmt="xyxy", out_fmt=fmt + ) + expected = ops.box_area(box_tensor, fmt) + + class BoxArea(torch.nn.Module): + # We are using this intermediate class + # since torchscript does not support + # neither partial nor lambda functions for this test. 
+ def __init__(self, fmt): + super().__init__() + self.area = ops.box_area + self.fmt = fmt + + def forward(self, boxes): + return self.area(boxes, self.fmt) - def test_box_area_jit(self): - box_tensor = torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=torch.float) - expected = ops.box_area(box_tensor) - scripted_fn = torch.jit.script(ops.box_area) + scripted_fn = torch.jit.script(BoxArea(fmt)) scripted_area = scripted_fn(box_tensor) torch.testing.assert_close(scripted_area, expected) @@ -1178,25 +1513,28 @@ def test_box_area_jit(self): ] -def gen_box(size, dtype=torch.float): +def gen_box(size, dtype=torch.float, fmt="xyxy") -> Tensor: xy1 = torch.rand((size, 2), dtype=dtype) xy2 = xy1 + torch.rand((size, 2), dtype=dtype) - return torch.cat([xy1, xy2], axis=-1) + return ops.box_convert(torch.cat([xy1, xy2], axis=-1), in_fmt="xyxy", out_fmt=fmt) class TestIouBase: @staticmethod - def _run_test(target_fn: Callable, actual_box1, actual_box2, dtypes, atol, expected): + def _run_test(target_fn: Callable, actual_box1, actual_box2, dtypes, atol, expected, fmt="xyxy"): for dtype in dtypes: - actual_box1 = torch.tensor(actual_box1, dtype=dtype) - actual_box2 = torch.tensor(actual_box2, dtype=dtype) + _actual_box1 = ops.box_convert(torch.tensor(actual_box1, dtype=dtype), in_fmt="xyxy", out_fmt=fmt) + _actual_box2 = ops.box_convert(torch.tensor(actual_box2, dtype=dtype), in_fmt="xyxy", out_fmt=fmt) expected_box = torch.tensor(expected) - out = target_fn(actual_box1, actual_box2) + out = target_fn( + _actual_box1, + _actual_box2, + ) torch.testing.assert_close(out, expected_box, rtol=0.0, check_dtype=False, atol=atol) @staticmethod - def _run_jit_test(target_fn: Callable, actual_box: List): - box_tensor = torch.tensor(actual_box, dtype=torch.float) + def _run_jit_test(target_fn: Callable, actual_box: list, fmt="xyxy"): + box_tensor = ops.box_convert(torch.tensor(actual_box, dtype=torch.float), in_fmt="xyxy", out_fmt=fmt) expected = target_fn(box_tensor, box_tensor) scripted_fn = torch.jit.script(target_fn) scripted_out = scripted_fn(box_tensor, box_tensor) @@ -1213,12 +1551,20 @@ def _cartesian_product(boxes1, boxes2, target_fn: Callable): return result @staticmethod - def _run_cartesian_test(target_fn: Callable): - boxes1 = gen_box(5) - boxes2 = gen_box(7) + def _run_cartesian_test(target_fn: Callable, fmt: str = "xyxy"): + boxes1 = gen_box(5, fmt=fmt) + boxes2 = gen_box(7, fmt=fmt) a = TestIouBase._cartesian_product(boxes1, boxes2, target_fn) b = target_fn(boxes1, boxes2) - assert torch.allclose(a, b) + torch.testing.assert_close(a, b) + + @staticmethod + def _run_batch_test(target_fn: Callable, fmt: str = "xyxy"): + boxes1 = torch.stack([gen_box(5, fmt=fmt) for _ in range(3)], dim=0) + boxes2 = torch.stack([gen_box(5, fmt=fmt) for _ in range(3)], dim=0) + native: Tensor = target_fn(boxes1, boxes2) + iterative: Tensor = torch.stack([target_fn(*pairs) for pairs in zip(boxes1, boxes2)], dim=0) + torch.testing.assert_close(native, iterative) class TestBoxIou(TestIouBase): @@ -1233,14 +1579,33 @@ class TestBoxIou(TestIouBase): pytest.param(FLOAT_BOXES, FLOAT_BOXES, [torch.float32, torch.float64], 1e-3, float_expected), ], ) - def test_iou(self, actual_box1, actual_box2, dtypes, atol, expected): - self._run_test(ops.box_iou, actual_box1, actual_box2, dtypes, atol, expected) + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_iou(self, actual_box1, actual_box2, dtypes, atol, expected, fmt): + self._run_test(partial(ops.box_iou, fmt=fmt), actual_box1, actual_box2, dtypes, atol, 
expected, fmt) - def test_iou_jit(self): - self._run_jit_test(ops.box_iou, INT_BOXES) + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_iou_jit(self, fmt): + class IoUJit(torch.nn.Module): + # We are using this intermediate class + # since torchscript does not support + # neither partial nor lambda functions for this test. + def __init__(self, fmt): + super().__init__() + self.iou = ops.box_iou + self.fmt = fmt - def test_iou_cartesian(self): - self._run_cartesian_test(ops.box_iou) + def forward(self, boxes1, boxes2): + return self.iou(boxes1, boxes2, fmt=self.fmt) + + self._run_jit_test(IoUJit(fmt=fmt), INT_BOXES, fmt) + + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_iou_cartesian(self, fmt): + self._run_cartesian_test(partial(ops.box_iou, fmt=fmt)) + + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_iou_batch(self, fmt): + self._run_batch_test(partial(ops.box_iou, fmt=fmt)) class TestGeneralizedBoxIou(TestIouBase): @@ -1264,6 +1629,9 @@ def test_iou_jit(self): def test_iou_cartesian(self): self._run_cartesian_test(ops.generalized_box_iou) + def test_iou_batch(self): + self._run_batch_test(ops.generalized_box_iou) + class TestDistanceBoxIoU(TestIouBase): int_expected = [ @@ -1291,6 +1659,9 @@ def test_iou_jit(self): def test_iou_cartesian(self): self._run_cartesian_test(ops.distance_box_iou) + def test_iou_batch(self): + self._run_batch_test(ops.distance_box_iou) + class TestCompleteBoxIou(TestIouBase): int_expected = [ @@ -1318,6 +1689,9 @@ def test_iou_jit(self): def test_iou_cartesian(self): self._run_cartesian_test(ops.complete_box_iou) + def test_iou_batch(self): + self._run_batch_test(ops.complete_box_iou) + def get_boxes(dtype, device): box1 = torch.tensor([-1, -1, 1, 1], dtype=dtype, device=device) @@ -1351,10 +1725,9 @@ def assert_empty_loss(iou_fn, dtype, device): class TestGeneralizedBoxIouLoss: # We refer to original test: https://github.com/facebookresearch/fvcore/blob/main/tests/test_giou_loss.py - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_giou_loss(self, dtype, device): - box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) # Identical boxes should have loss of 0 @@ -1375,7 +1748,12 @@ def test_giou_loss(self, dtype, device): assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 2.5, device=device, reduction="sum") assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 1.25, device=device, reduction="mean") - @pytest.mark.parametrize("device", cpu_and_gpu()) + # Test reduction value + # reduction value other than ["none", "mean", "sum"] should raise a ValueError + with pytest.raises(ValueError, match="Invalid"): + ops.generalized_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): assert_empty_loss(ops.generalized_box_iou_loss, dtype, device) @@ -1383,7 +1761,7 @@ def test_empty_inputs(self, dtype, device): class TestCompleteBoxIouLoss: @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_ciou_loss(self, dtype, device): box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) @@ -1394,14 +1772,17 @@ def test_ciou_loss(self, dtype, device): 
assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") - @pytest.mark.parametrize("device", cpu_and_gpu()) + with pytest.raises(ValueError, match="Invalid"): + ops.complete_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): assert_empty_loss(ops.complete_box_iou_loss, dtype, device) class TestDistanceBoxIouLoss: - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_distance_iou_loss(self, dtype, device): box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) @@ -1413,7 +1794,10 @@ def test_distance_iou_loss(self, dtype, device): assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") - @pytest.mark.parametrize("device", cpu_and_gpu()) + with pytest.raises(ValueError, match="Invalid"): + ops.distance_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_distance_iou_inputs(self, dtype, device): assert_empty_loss(ops.distance_box_iou_loss, dtype, device) @@ -1458,7 +1842,7 @@ def generate_tensor_with_range_type(shape, range_type, **kwargs): @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0]) @pytest.mark.parametrize("gamma", [0, 2]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [0, 1]) def test_correct_ratio(self, alpha, gamma, device, dtype, seed): @@ -1487,7 +1871,7 @@ def test_correct_ratio(self, alpha, gamma, device, dtype, seed): torch.testing.assert_close(correct_ratio, loss_ratio, atol=tol, rtol=tol) @pytest.mark.parametrize("reduction", ["mean", "sum"]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [2, 3]) def test_equal_ce_loss(self, reduction, device, dtype, seed): @@ -1514,7 +1898,7 @@ def test_equal_ce_loss(self, reduction, device, dtype, seed): @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0]) @pytest.mark.parametrize("gamma", [0, 2]) @pytest.mark.parametrize("reduction", ["none", "mean", "sum"]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [4, 5]) def test_jit(self, alpha, gamma, reduction, device, dtype, seed): @@ -1524,17 +1908,22 @@ def test_jit(self, alpha, gamma, reduction, device, dtype, seed): torch.random.manual_seed(seed) inputs, targets = self._generate_diverse_input_target_pair(dtype=dtype, device=device) focal_loss = ops.sigmoid_focal_loss(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) - if device == "cpu": - scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) - else: - with torch.jit.fuser("fuser2"): - # Use fuser2 to prevent a bug on fuser: 
https://github.com/pytorch/pytorch/issues/75476 - # We may remove this condition once the bug is resolved - scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) + scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) tol = 1e-3 if dtype is torch.half else 1e-5 torch.testing.assert_close(focal_loss, scripted_focal_loss, rtol=tol, atol=tol) + # Raise ValueError for anonymous reduction mode + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) + def test_reduction_mode(self, device, dtype, reduction="xyz"): + if device == "cpu" and dtype is torch.half: + pytest.skip("Currently torch.half is not fully supported on cpu") + torch.random.manual_seed(0) + inputs, targets = self._generate_diverse_input_target_pair(device=device, dtype=dtype) + with pytest.raises(ValueError, match="Invalid"): + ops.sigmoid_focal_loss(inputs, targets, 0.25, 2, reduction) + class TestMasksToBoxes: def test_masks_box(self): @@ -1604,7 +1993,7 @@ def test_stochastic_depth_random(self, seed, mode, p): counts += batch_size - non_zero_count num_samples += batch_size - p_value = stats.binom_test(counts, num_samples, p=p) + p_value = stats.binomtest(counts, num_samples, p=p).pvalue assert p_value > 0.01 @pytest.mark.parametrize("seed", range(10)) diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_datasets_builtin.py similarity index 67% rename from test/test_prototype_builtin_datasets.py rename to test/test_prototype_datasets_builtin.py index 6ddba1806c6..5f8fc90debf 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_datasets_builtin.py @@ -1,28 +1,54 @@ -import functools import io import pickle +from collections import deque from pathlib import Path import pytest import torch +import torchvision.transforms.v2 as transforms + from builtin_dataset_mocks import DATASET_MOCKS, parametrize_dataset_mocks -from torch.testing._comparison import assert_equal, ObjectPair, TensorLikePair +from torch.testing._comparison import not_close_error_metas, ObjectPair, TensorLikePair + +# TODO: replace with torchdata.dataloader2.DataLoader2 as soon as it is stable-ish from torch.utils.data import DataLoader -from torch.utils.data.graph import traverse + +# TODO: replace with torchdata equivalent as soon as it is available from torch.utils.data.graph_settings import get_all_graph_pipes + +from torchdata.dataloader2.graph.utils import traverse_dps from torchdata.datapipes.iter import ShardingFilter, Shuffler +from torchdata.datapipes.utils import StreamWrapper +from torchvision import tv_tensors from torchvision._utils import sequence_to_str -from torchvision.prototype import datasets, transforms +from torchvision.prototype import datasets +from torchvision.prototype.datasets.utils import EncodedImage from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE -from torchvision.prototype.features import Image, Label +from torchvision.prototype.tv_tensors import Label +from torchvision.transforms.v2._utils import is_pure_tensor + -assert_samples_equal = functools.partial( - assert_equal, pair_types=(TensorLikePair, ObjectPair), rtol=0, atol=0, equal_nan=True -) +def assert_samples_equal(*args, msg=None, **kwargs): + error_metas = not_close_error_metas( + *args, pair_types=(TensorLikePair, ObjectPair), rtol=0, atol=0, equal_nan=True, **kwargs + ) + if error_metas: + raise error_metas[0].to_error(msg) def extract_datapipes(dp): - 
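The new reduction="xyz" cases above (for the generalized/complete/distance box IoU losses and sigmoid_focal_loss) only require that an unrecognized reduction raises a ValueError whose message contains "Invalid". A minimal sketch of that kind of guard is shown below, assuming the usual "none"/"mean"/"sum" modes; the exact wording and empty-input handling in torchvision may differ.

import torch


def _apply_reduction(loss, reduction):
    # Illustrative dispatch only; rejects anything outside the supported modes.
    if reduction == "none":
        return loss
    if reduction == "mean":
        # One way to keep empty inputs well-defined and differentiable
        # (cf. the test_empty_inputs cases above).
        return loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
    if reduction == "sum":
        return loss.sum()
    raise ValueError(f"Invalid reduction mode: '{reduction}'. Use 'none', 'mean' or 'sum'.")


# e.g. _apply_reduction(torch.rand(4), "xyz") raises ValueError with "Invalid" in the message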
return get_all_graph_pipes(traverse(dp, only_datapipe=True)) + return get_all_graph_pipes(traverse_dps(dp)) + + +def consume(iterator): + # Copied from the official itertools recipes: https://docs.python.org/3/library/itertools.html#itertools-recipes + deque(iterator, maxlen=0) + + +def next_consume(iterator): + item = next(iterator) + consume(iterator) + return item @pytest.fixture(autouse=True) @@ -66,7 +92,7 @@ def test_sample(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) try: - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) except StopIteration: raise AssertionError("Unable to draw any sample.") from None except Exception as error: @@ -84,29 +110,64 @@ def test_num_samples(self, dataset_mock, config): assert len(list(dataset)) == mock_info["num_samples"] + @pytest.fixture + def log_session_streams(self): + debug_unclosed_streams = StreamWrapper.debug_unclosed_streams + try: + StreamWrapper.debug_unclosed_streams = True + yield + finally: + StreamWrapper.debug_unclosed_streams = debug_unclosed_streams + + @parametrize_dataset_mocks(DATASET_MOCKS) + def test_stream_closing(self, log_session_streams, dataset_mock, config): + def make_msg_and_close(head): + unclosed_streams = [] + for stream in list(StreamWrapper.session_streams.keys()): + unclosed_streams.append(repr(stream.file_obj)) + stream.close() + unclosed_streams = "\n".join(unclosed_streams) + return f"{head}\n\n{unclosed_streams}" + + if StreamWrapper.session_streams: + raise pytest.UsageError(make_msg_and_close("A previous test did not close the following streams:")) + + dataset, _ = dataset_mock.load(config) + + consume(iter(dataset)) + + if StreamWrapper.session_streams: + raise AssertionError(make_msg_and_close("The following streams were not closed after a full iteration:")) + @parametrize_dataset_mocks(DATASET_MOCKS) - def test_no_vanilla_tensors(self, dataset_mock, config): + def test_no_unaccompanied_pure_tensors(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) + sample = next_consume(iter(dataset)) + + pure_tensors = {key for key, value in sample.items() if is_pure_tensor(value)} - vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor} - if vanilla_tensors: + if pure_tensors and not any( + isinstance(item, (tv_tensors.Image, tv_tensors.Video, EncodedImage)) for item in sample.values() + ): raise AssertionError( f"The values of key(s) " - f"{sequence_to_str(sorted(vanilla_tensors), separate_last='and ')} contained vanilla tensors." + f"{sequence_to_str(sorted(pure_tensors), separate_last='and ')} contained pure tensors, " + f"but didn't find any (encoded) image or video." 
) @parametrize_dataset_mocks(DATASET_MOCKS) def test_transformable(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - next(iter(dataset.map(transforms.Identity()))) + dataset = dataset.map(transforms.Identity()) + + consume(iter(dataset)) - @pytest.mark.parametrize("only_datapipe", [False, True]) @parametrize_dataset_mocks(DATASET_MOCKS) - def test_traversable(self, dataset_mock, config, only_datapipe): + def test_traversable(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - traverse(dataset, only_datapipe=only_datapipe) + traverse_dps(dataset) @parametrize_dataset_mocks(DATASET_MOCKS) def test_serializable(self, dataset_mock, config): @@ -132,7 +193,7 @@ def test_data_loader(self, dataset_mock, config, num_workers): collate_fn=self._collate_fn, ) - next(iter(dl)) + consume(dl) # TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also # that the Shuffler comes before the ShardingFilter. Early commits in https://github.com/pytorch/vision/pull/5680 @@ -149,12 +210,12 @@ def test_has_annotations(self, dataset_mock, config, annotation_dp_type): def test_save_load(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) with io.BytesIO() as buffer: torch.save(sample, buffer) buffer.seek(0) - assert_samples_equal(torch.load(buffer), sample) + assert_samples_equal(torch.load(buffer, weights_only=True), sample) @parametrize_dataset_mocks(DATASET_MOCKS) def test_infinite_buffer_size(self, dataset_mock, config): @@ -178,7 +239,7 @@ class TestQMNIST: def test_extra_label(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) for key, type in ( ("nist_hsf_series", int), ("nist_writer_id", int), @@ -215,7 +276,7 @@ def test_sample_content(self, dataset_mock, config): assert "image" in sample assert "label" in sample - assert isinstance(sample["image"], Image) + assert isinstance(sample["image"], tv_tensors.Image) assert isinstance(sample["label"], Label) assert sample["image"].shape == (1, 16, 16) diff --git a/test/test_prototype_models.py b/test/test_prototype_models.py index 56f7b9cb6ac..d32df68f1f4 100644 --- a/test/test_prototype_models.py +++ b/test/test_prototype_models.py @@ -1,13 +1,13 @@ import pytest import test_models as TM import torch -from common_utils import cpu_and_gpu, set_rng_seed +from common_utils import cpu_and_cuda, set_rng_seed from torchvision.prototype import models -@pytest.mark.parametrize("model_fn", TM.list_model_fns(models.depth.stereo)) +@pytest.mark.parametrize("model_fn", (models.depth.stereo.raft_stereo_base,)) @pytest.mark.parametrize("model_mode", ("standard", "scripted")) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_raft_stereo(model_fn, model_mode, dev): # A simple test to make sure the model can do forward pass and jit scriptable set_rng_seed(0) @@ -36,3 +36,49 @@ def test_raft_stereo(model_fn, model_mode, dev): # Test against expected file output TM._assert_expected(depth_pred, name=model_fn.__name__, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize("model_fn", (models.depth.stereo.crestereo_base,)) +@pytest.mark.parametrize("model_mode", ("standard", "scripted")) +@pytest.mark.parametrize("dev", cpu_and_cuda()) +def test_crestereo(model_fn, model_mode, dev): + set_rng_seed(0) + + model = model_fn().eval().to(dev) + + if model_mode == "scripted": 
+ model = torch.jit.script(model) + + img1 = torch.rand(1, 3, 64, 64).to(dev) + img2 = torch.rand(1, 3, 64, 64).to(dev) + iterations = 3 + + preds = model(img1, img2, flow_init=None, num_iters=iterations) + disparity_pred = preds[-1] + + # all the pyramid levels except the highest res make only half the number of iterations + expected_iterations = (iterations // 2) * (len(model.resolutions) - 1) + expected_iterations += iterations + assert ( + len(preds) == expected_iterations + ), "Number of predictions should be the number of iterations multiplied by the number of pyramid levels" + + assert disparity_pred.shape == torch.Size( + [1, 2, 64, 64] + ), f"Predicted disparity should have the same spatial shape as the input. Inputs shape {img1.shape[2:]}, Prediction shape {disparity_pred.shape[2:]}" + + assert all( + d.shape == torch.Size([1, 2, 64, 64]) for d in preds + ), "All predicted disparities are expected to have the same shape" + + # test a backward pass with a dummy loss as well + preds = torch.stack(preds, dim=0) + targets = torch.ones_like(preds, requires_grad=False) + loss = torch.nn.functional.mse_loss(preds, targets) + + try: + loss.backward() + except Exception as e: + assert False, f"Backward pass failed with an unexpected exception: {e.__class__.__name__} {e}" + + TM._assert_expected(disparity_pred, name=model_fn.__name__, atol=1e-2, rtol=1e-2) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 0f485a10550..85ef98cf7b8 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1,1316 +1,42 @@ -import itertools - -import numpy as np +import collections.abc +import re import PIL.Image - import pytest import torch -from common_utils import assert_equal, cpu_and_gpu -from test_prototype_transforms_functional import ( - make_bounding_box, - make_bounding_boxes, - make_image, - make_images, - make_label, - make_one_hot_labels, - make_segmentation_mask, -) -from torchvision.ops.boxes import box_iou -from torchvision.prototype import features, transforms -from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image - - -def make_vanilla_tensor_images(*args, **kwargs): - for image in make_images(*args, **kwargs): - if image.ndim > 3: - continue - yield image.data - - -def make_pil_images(*args, **kwargs): - for image in make_vanilla_tensor_images(*args, **kwargs): - yield to_pil_image(image) - - -def make_vanilla_tensor_bounding_boxes(*args, **kwargs): - for bounding_box in make_bounding_boxes(*args, **kwargs): - yield bounding_box.data - - -def parametrize(transforms_with_inputs): - return pytest.mark.parametrize( - ("transform", "input"), - [ - pytest.param( - transform, - input, - id=f"{type(transform).__name__}-{type(input).__module__}.{type(input).__name__}-{idx}", - ) - for transform, inputs in transforms_with_inputs - for idx, input in enumerate(inputs) - ], - ) - - -def parametrize_from_transforms(*transforms): - transforms_with_inputs = [] - for transform in transforms: - for creation_fn in [ - make_images, - make_bounding_boxes, - make_one_hot_labels, - make_vanilla_tensor_images, - make_pil_images, - ]: - inputs = list(creation_fn()) - try: - output = transform(inputs[0]) - except Exception: - continue - else: - if output is inputs[0]: - continue - - transforms_with_inputs.append((transform, inputs)) - - return parametrize(transforms_with_inputs) - - -class TestSmoke: - @parametrize_from_transforms( - transforms.RandomErasing(p=1.0), - transforms.Resize([16, 16]), - 
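For the expected-iterations assertion in test_crestereo further up: every pyramid level except the highest-resolution one runs only half the refinement iterations, so the number of returned predictions is the full iteration count plus (iterations // 2) for each remaining level. A quick worked check with hypothetical values (the real level count comes from len(model.resolutions) at runtime):

# Hypothetical values for illustration only.
iterations = 3
num_resolutions = 3  # assumed; the test reads this from model.resolutions
expected_iterations = (iterations // 2) * (num_resolutions - 1) + iterations
assert expected_iterations == 5  # 3 full-resolution steps + (3 // 2) = 1 from each of the 2 coarser levels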
transforms.CenterCrop([16, 16]), - transforms.ConvertImageDtype(), - transforms.RandomHorizontalFlip(), - transforms.Pad(5), - transforms.RandomZoomOut(), - transforms.RandomRotation(degrees=(-45, 45)), - transforms.RandomAffine(degrees=(-45, 45)), - transforms.RandomCrop([16, 16], padding=1, pad_if_needed=True), - # TODO: Something wrong with input data setup. Let's fix that - # transforms.RandomEqualize(), - # transforms.RandomInvert(), - # transforms.RandomPosterize(bits=4), - # transforms.RandomSolarize(threshold=0.5), - # transforms.RandomAdjustSharpness(sharpness_factor=0.5), - ) - def test_common(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transform, - [ - dict( - image=features.Image.new_like(image, image.unsqueeze(0), dtype=torch.float), - one_hot_label=features.OneHotLabel.new_like( - one_hot_label, one_hot_label.unsqueeze(0), dtype=torch.float - ), - ) - for image, one_hot_label in itertools.product(make_images(), make_one_hot_labels()) - ], - ) - for transform in [ - transforms.RandomMixup(alpha=1.0), - transforms.RandomCutmix(alpha=1.0), - ] - ] - ) - def test_mixup_cutmix(self, transform, input): - transform(input) - - # add other data that should bypass and wont raise any error - input_copy = dict(input) - input_copy["path"] = "/path/to/somewhere" - input_copy["num"] = 1234 - transform(input_copy) - - # Check if we raise an error if sample contains bbox or mask or label - err_msg = "does not support bounding boxes, segmentation masks and plain labels" - input_copy = dict(input) - for unsup_data in [make_label(), make_bounding_box(format="XYXY"), make_segmentation_mask()]: - input_copy["unsupported"] = unsup_data - with pytest.raises(TypeError, match=err_msg): - transform(input_copy) - - @parametrize( - [ - ( - transform, - itertools.chain.from_iterable( - fn( - color_spaces=[ - features.ColorSpace.GRAY, - features.ColorSpace.RGB, - ], - dtypes=[torch.uint8], - extra_dims=[(4,)], - ) - for fn in [ - make_images, - make_vanilla_tensor_images, - make_pil_images, - ] - ), - ) - for transform in ( - transforms.RandAugment(), - transforms.TrivialAugmentWide(), - transforms.AutoAugment(), - transforms.AugMix(), - ) - ] - ) - def test_auto_augment(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]), - itertools.chain.from_iterable( - fn(color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]) - for fn in [ - make_images, - make_vanilla_tensor_images, - ] - ), - ), - ] - ) - def test_normalize(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.RandomResizedCrop([16, 16]), - itertools.chain( - make_images(extra_dims=[(4,)]), - make_vanilla_tensor_images(), - make_pil_images(), - ), - ) - ] - ) - def test_random_resized_crop(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.ConvertColorSpace(color_space=new_color_space, old_color_space=old_color_space), - itertools.chain.from_iterable( - [ - fn(color_spaces=[old_color_space]) - for fn in ( - make_images, - make_vanilla_tensor_images, - make_pil_images, - ) - ] - ), - ) - for old_color_space, new_color_space in itertools.product( - [ - features.ColorSpace.GRAY, - features.ColorSpace.GRAY_ALPHA, - features.ColorSpace.RGB, - features.ColorSpace.RGB_ALPHA, - ], - repeat=2, - ) - ] - ) - def test_convertolor_space(self, transform, input): - transform(input) - - -@pytest.mark.parametrize("p", [0.0, 1.0]) -class TestRandomHorizontalFlip: - def 
input_expected_image_tensor(self, p, dtype=torch.float32): - input = torch.tensor([[[0, 1], [0, 1]], [[1, 0], [1, 0]]], dtype=dtype) - expected = torch.tensor([[[1, 0], [1, 0]], [[0, 1], [0, 1]]], dtype=dtype) - - return input, expected if p == 1 else input - - def test_simple_tensor(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(input) - - assert_equal(expected, actual) - - def test_pil_image(self, p): - input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(to_pil_image(input)) - - assert_equal(expected, pil_to_tensor(actual)) - - def test_features_image(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(features.Image(input)) - - assert_equal(features.Image(expected), actual) - - def test_features_segmentation_mask(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(features.SegmentationMask(input)) - - assert_equal(features.SegmentationMask(expected), actual) - - def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(input) - - expected_image_tensor = torch.tensor([5, 0, 10, 5]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) - assert_equal(expected, actual) - assert actual.format == expected.format - assert actual.image_size == expected.image_size - - -@pytest.mark.parametrize("p", [0.0, 1.0]) -class TestRandomVerticalFlip: - def input_expected_image_tensor(self, p, dtype=torch.float32): - input = torch.tensor([[[1, 1], [0, 0]], [[1, 1], [0, 0]]], dtype=dtype) - expected = torch.tensor([[[0, 0], [1, 1]], [[0, 0], [1, 1]]], dtype=dtype) - - return input, expected if p == 1 else input - - def test_simple_tensor(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(input) - - assert_equal(expected, actual) - - def test_pil_image(self, p): - input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(to_pil_image(input)) - - assert_equal(expected, pil_to_tensor(actual)) - - def test_features_image(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(features.Image(input)) - - assert_equal(features.Image(expected), actual) - - def test_features_segmentation_mask(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(features.SegmentationMask(input)) - - assert_equal(features.SegmentationMask(expected), actual) - - def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(input) - - expected_image_tensor = torch.tensor([0, 5, 5, 10]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) - assert_equal(expected, actual) - assert actual.format == expected.format - assert actual.image_size 
== expected.image_size - - -class TestPad: - def test_assertions(self): - with pytest.raises(TypeError, match="Got inappropriate padding arg"): - transforms.Pad("abc") - - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): - transforms.Pad([-0.7, 0, 0.7]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.Pad(12, fill="abc") - - with pytest.raises(ValueError, match="Padding mode should be either"): - transforms.Pad(12, padding_mode="abc") - - @pytest.mark.parametrize("padding", [1, (1, 2), [1, 2, 3, 4]]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("padding_mode", ["constant", "edge"]) - def test__transform(self, padding, fill, padding_mode, mocker): - transform = transforms.Pad(padding, fill=fill, padding_mode=padding_mode) - - fn = mocker.patch("torchvision.prototype.transforms.functional.pad") - inpt = mocker.MagicMock(spec=features.Image) - _ = transform(inpt) - - fn.assert_called_once_with(inpt, padding=padding, fill=fill, padding_mode=padding_mode) - -class TestRandomZoomOut: - def test_assertions(self): - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomZoomOut(fill="abc") +from common_utils import assert_equal, make_bounding_boxes, make_detection_masks, make_image, make_video - with pytest.raises(TypeError, match="should be a sequence of length"): - transforms.RandomZoomOut(0, side_range=0) +from torchvision.prototype import transforms, tv_tensors +from torchvision.transforms.v2._utils import check_type, is_pure_tensor +from torchvision.transforms.v2.functional import clamp_bounding_boxes, InterpolationMode, pil_to_tensor, to_pil_image - with pytest.raises(ValueError, match="Invalid canvas side range"): - transforms.RandomZoomOut(0, side_range=[4.0, 1.0]) +from torchvision.tv_tensors import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__get_params(self, fill, side_range, mocker): - transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) - image = mocker.MagicMock(spec=features.Image) - h, w = image.image_size = (24, 32) - - params = transform._get_params(image) - - assert params["fill"] == fill - assert len(params["padding"]) == 4 - assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w - assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h - assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w - assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h - - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__transform(self, fill, side_range, mocker): - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) - - fn = mocker.patch("torchvision.prototype.transforms.functional.pad") - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params) - - -class TestRandomRotation: - def test_assertions(self): - with pytest.raises(ValueError, match="is a single 
number, it must be positive"): - transforms.RandomRotation(-0.7) - - for d in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="degrees should be a sequence of length 2"): - transforms.RandomRotation(d) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomRotation(12, fill="abc") - - with pytest.raises(TypeError, match="center should be a sequence of length"): - transforms.RandomRotation(12, center=12) - - with pytest.raises(ValueError, match="center should be a sequence of length"): - transforms.RandomRotation(12, center=[1, 2, 3]) - - def test__get_params(self): - angle_bound = 34 - transform = transforms.RandomRotation(angle_bound) - - params = transform._get_params(None) - assert -angle_bound <= params["angle"] <= angle_bound - - angle_bounds = [12, 34] - transform = transforms.RandomRotation(angle_bounds) - - params = transform._get_params(None) - assert angle_bounds[0] <= params["angle"] <= angle_bounds[1] - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("expand", [False, True]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("center", [None, [2.0, 3.0]]) - def test__transform(self, degrees, expand, fill, center, mocker): - interpolation = InterpolationMode.BILINEAR - transform = transforms.RandomRotation( - degrees, interpolation=interpolation, expand=expand, fill=fill, center=center +def _parse_categories(categories): + if categories is None: + num_categories = int(torch.randint(1, 11, ())) + elif isinstance(categories, int): + num_categories = categories + categories = [f"category{idx}" for idx in range(num_categories)] + elif isinstance(categories, collections.abc.Sequence) and all(isinstance(category, str) for category in categories): + categories = list(categories) + num_categories = len(categories) + else: + raise pytest.UsageError( + f"`categories` can either be `None` (default), an integer, or a sequence of strings, " + f"but got '{categories}' instead." 
) + return categories, num_categories - if isinstance(degrees, (tuple, list)): - assert transform.degrees == [float(degrees[0]), float(degrees[1])] - else: - assert transform.degrees == [float(-degrees), float(degrees)] - - fn = mocker.patch("torchvision.prototype.transforms.functional.rotate") - inpt = mocker.MagicMock(spec=features.Image) - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params, interpolation=interpolation, expand=expand, fill=fill, center=center) - @pytest.mark.parametrize("angle", [34, -87]) - @pytest.mark.parametrize("expand", [False, True]) - def test_boundingbox_image_size(self, angle, expand): - # Specific test for BoundingBox.rotate - bbox = features.BoundingBox( - torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, image_size=(32, 32) - ) - img = features.Image(torch.rand(1, 3, 32, 32)) - - out_img = img.rotate(angle, expand=expand) - out_bbox = bbox.rotate(angle, expand=expand) - - assert out_img.image_size == out_bbox.image_size - - -class TestRandomAffine: - def test_assertions(self): - with pytest.raises(ValueError, match="is a single number, it must be positive"): - transforms.RandomAffine(-0.7) - - for d in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="degrees should be a sequence of length 2"): - transforms.RandomAffine(d) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomAffine(12, fill="abc") - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomAffine(12, fill="abc") - - for kwargs in [ - {"center": 12}, - {"translate": 12}, - {"scale": 12}, - ]: - with pytest.raises(TypeError, match="should be a sequence of length"): - transforms.RandomAffine(12, **kwargs) - - for kwargs in [{"center": [1, 2, 3]}, {"translate": [1, 2, 3]}, {"scale": [1, 2, 3]}]: - with pytest.raises(ValueError, match="should be a sequence of length"): - transforms.RandomAffine(12, **kwargs) - - with pytest.raises(ValueError, match="translation values should be between 0 and 1"): - transforms.RandomAffine(12, translate=[-1.0, 2.0]) - - with pytest.raises(ValueError, match="scale values should be positive"): - transforms.RandomAffine(12, scale=[-1.0, 2.0]) - - with pytest.raises(ValueError, match="is a single number, it must be positive"): - transforms.RandomAffine(12, shear=-10) - - for s in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="shear should be a sequence of length 2"): - transforms.RandomAffine(12, shear=s) - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("translate", [None, [0.1, 0.2]]) - @pytest.mark.parametrize("scale", [None, [0.7, 1.2]]) - @pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]]) - def test__get_params(self, degrees, translate, scale, shear, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size - - transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) - params = transform._get_params(image) - - if not isinstance(degrees, (list, tuple)): - assert -degrees <= params["angle"] <= degrees - else: - assert degrees[0] <= params["angle"] <= degrees[1] - - if translate is not None: - w_max = 
int(round(translate[0] * w)) - h_max = int(round(translate[1] * h)) - assert -w_max <= params["translations"][0] <= w_max - assert -h_max <= params["translations"][1] <= h_max - else: - assert params["translations"] == (0, 0) - - if scale is not None: - assert scale[0] <= params["scale"] <= scale[1] - else: - assert params["scale"] == 1.0 - - if shear is not None: - if isinstance(shear, float): - assert -shear <= params["shear"][0] <= shear - assert params["shear"][1] == 0.0 - elif len(shear) == 2: - assert shear[0] <= params["shear"][0] <= shear[1] - assert params["shear"][1] == 0.0 - else: - assert shear[0] <= params["shear"][0] <= shear[1] - assert shear[2] <= params["shear"][1] <= shear[3] - else: - assert params["shear"] == (0, 0) - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("translate", [None, [0.1, 0.2]]) - @pytest.mark.parametrize("scale", [None, [0.7, 1.2]]) - @pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("center", [None, [2.0, 3.0]]) - def test__transform(self, degrees, translate, scale, shear, fill, center, mocker): - interpolation = InterpolationMode.BILINEAR - transform = transforms.RandomAffine( - degrees, - translate=translate, - scale=scale, - shear=shear, - interpolation=interpolation, - fill=fill, - center=center, - ) - - if isinstance(degrees, (tuple, list)): - assert transform.degrees == [float(degrees[0]), float(degrees[1])] - else: - assert transform.degrees == [float(-degrees), float(degrees)] - - fn = mocker.patch("torchvision.prototype.transforms.functional.affine") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params, interpolation=interpolation, fill=fill, center=center) - - -class TestRandomCrop: - def test_assertions(self): - with pytest.raises(ValueError, match="Please provide only two dimensions"): - transforms.RandomCrop([10, 12, 14]) - - with pytest.raises(TypeError, match="Got inappropriate padding arg"): - transforms.RandomCrop([10, 12], padding="abc") - - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): - transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomCrop([10, 12], padding=1, fill="abc") - - with pytest.raises(ValueError, match="Padding mode should be either"): - transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") - - @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) - @pytest.mark.parametrize("size, pad_if_needed", [((10, 10), False), ((50, 25), True)]) - def test__get_params(self, padding, pad_if_needed, size, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size - - transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) - params = transform._get_params(image) - - if padding is not None: - if isinstance(padding, int): - h += 2 * padding - w += 2 * padding - elif isinstance(padding, list) and len(padding) == 2: - w += 2 * padding[0] - h += 2 * padding[1] - 
elif isinstance(padding, list) and len(padding) == 4: - w += padding[0] + padding[2] - h += padding[1] + padding[3] - - expected_input_width = w - expected_input_height = h - - if pad_if_needed: - if w < size[1]: - w += 2 * (size[1] - w) - if h < size[0]: - h += 2 * (size[0] - h) - - assert 0 <= params["top"] <= h - size[0] + 1 - assert 0 <= params["left"] <= w - size[1] + 1 - assert params["height"] == size[0] - assert params["width"] == size[1] - assert params["input_width"] == expected_input_width - assert params["input_height"] == expected_input_height - - @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) - @pytest.mark.parametrize("pad_if_needed", [False, True]) - @pytest.mark.parametrize("fill", [False, True]) - @pytest.mark.parametrize("padding_mode", ["constant", "edge"]) - def test__transform(self, padding, pad_if_needed, fill, padding_mode, mocker): - output_size = [10, 12] - transform = transforms.RandomCrop( - output_size, padding=padding, pad_if_needed=pad_if_needed, fill=fill, padding_mode=padding_mode - ) - - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (32, 32) - - expected = mocker.MagicMock(spec=features.Image) - expected.num_channels = 3 - if isinstance(padding, int): - expected.image_size = (inpt.image_size[0] + padding, inpt.image_size[1] + padding) - elif isinstance(padding, list): - expected.image_size = ( - inpt.image_size[0] + sum(padding[0::2]), - inpt.image_size[1] + sum(padding[1::2]), - ) - else: - expected.image_size = inpt.image_size - _ = mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected) - fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop") - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - if padding is None and not pad_if_needed: - fn_crop.assert_called_once_with( - inpt, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] - ) - elif not pad_if_needed: - fn_crop.assert_called_once_with( - expected, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] - ) - elif padding is None: - # vfdev-5: I do not know how to mock and test this case - pass - else: - # vfdev-5: I do not know how to mock and test this case - pass - - -class TestGaussianBlur: - def test_assertions(self): - with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): - transforms.GaussianBlur([10, 12, 14]) - - with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): - transforms.GaussianBlur(4) - - with pytest.raises(TypeError, match="sigma should be a single float or a list/tuple with length 2"): - transforms.GaussianBlur(3, sigma=[1, 2, 3]) - - with pytest.raises(ValueError, match="If sigma is a single number, it must be positive"): - transforms.GaussianBlur(3, sigma=-1.0) - - with pytest.raises(ValueError, match="sigma values should be positive and of the form"): - transforms.GaussianBlur(3, sigma=[2.0, 1.0]) - - @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0]]) - def test__get_params(self, sigma): - transform = transforms.GaussianBlur(3, sigma=sigma) - params = transform._get_params(None) - - if isinstance(sigma, float): - assert params["sigma"][0] == params["sigma"][1] == 10 - else: - assert sigma[0] <= 
params["sigma"][0] <= sigma[1] - assert sigma[0] <= params["sigma"][1] <= sigma[1] - - @pytest.mark.parametrize("kernel_size", [3, [3, 5], (5, 3)]) - @pytest.mark.parametrize("sigma", [2.0, [2.0, 3.0]]) - def test__transform(self, kernel_size, sigma, mocker): - transform = transforms.GaussianBlur(kernel_size=kernel_size, sigma=sigma) - - if isinstance(kernel_size, (tuple, list)): - assert transform.kernel_size == kernel_size - else: - assert transform.kernel_size == (kernel_size, kernel_size) - - if isinstance(sigma, (tuple, list)): - assert transform.sigma == sigma - else: - assert transform.sigma == (sigma, sigma) - - fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params) - - -class TestRandomColorOp: - @pytest.mark.parametrize("p", [0.0, 1.0]) - @pytest.mark.parametrize( - "transform_cls, func_op_name, kwargs", - [ - (transforms.RandomEqualize, "equalize", {}), - (transforms.RandomInvert, "invert", {}), - (transforms.RandomAutocontrast, "autocontrast", {}), - (transforms.RandomPosterize, "posterize", {"bits": 4}), - (transforms.RandomSolarize, "solarize", {"threshold": 0.5}), - (transforms.RandomAdjustSharpness, "adjust_sharpness", {"sharpness_factor": 0.5}), - ], - ) - def test__transform(self, p, transform_cls, func_op_name, kwargs, mocker): - transform = transform_cls(p=p, **kwargs) - - fn = mocker.patch(f"torchvision.prototype.transforms.functional.{func_op_name}") - inpt = mocker.MagicMock(spec=features.Image) - _ = transform(inpt) - if p > 0.0: - fn.assert_called_once_with(inpt, **kwargs) - else: - assert fn.call_count == 0 - - -class TestRandomPerspective: - def test_assertions(self): - with pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"): - transforms.RandomPerspective(distortion_scale=-1.0) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomPerspective(0.5, fill="abc") - - def test__get_params(self, mocker): - dscale = 0.5 - transform = transforms.RandomPerspective(dscale) - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - params = transform._get_params(image) - - h, w = image.image_size - assert len(params["startpoints"]) == 4 - for x, y in params["startpoints"]: - assert x in (0, w - 1) - assert y in (0, h - 1) - - assert len(params["endpoints"]) == 4 - for (x, y), name in zip(params["endpoints"], ["tl", "tr", "br", "bl"]): - if "t" in name: - assert 0 <= y <= int(dscale * h // 2), (x, y, name) - if "b" in name: - assert h - int(dscale * h // 2) - 1 <= y <= h, (x, y, name) - if "l" in name: - assert 0 <= x <= int(dscale * w // 2), (x, y, name) - if "r" in name: - assert w - int(dscale * w // 2) - 1 <= x <= w, (x, y, name) - - @pytest.mark.parametrize("distortion_scale", [0.1, 0.7]) - def test__transform(self, distortion_scale, mocker): - interpolation = InterpolationMode.BILINEAR - fill = 12 - transform = transforms.RandomPerspective(distortion_scale, fill=fill, interpolation=interpolation) - - fn = mocker.patch("torchvision.prototype.transforms.functional.perspective") - inpt = 
mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) - - -class TestElasticTransform: - def test_assertions(self): - - with pytest.raises(TypeError, match="alpha should be float or a sequence of floats"): - transforms.ElasticTransform({}) - - with pytest.raises(ValueError, match="alpha is a sequence its length should be one of 2"): - transforms.ElasticTransform([1.0, 2.0, 3.0]) - - with pytest.raises(ValueError, match="alpha should be a sequence of floats"): - transforms.ElasticTransform([1, 2]) - - with pytest.raises(TypeError, match="sigma should be float or a sequence of floats"): - transforms.ElasticTransform(1.0, {}) - - with pytest.raises(ValueError, match="sigma is a sequence its length should be one of 2"): - transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0]) - - with pytest.raises(ValueError, match="sigma should be a sequence of floats"): - transforms.ElasticTransform(1.0, [1, 2]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.ElasticTransform(1.0, 2.0, fill="abc") - - def test__get_params(self, mocker): - alpha = 2.0 - sigma = 3.0 - transform = transforms.ElasticTransform(alpha, sigma) - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - params = transform._get_params(image) - - h, w = image.image_size - displacement = params["displacement"] - assert displacement.shape == (1, h, w, 2) - assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() - assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all() - - @pytest.mark.parametrize("alpha", [5.0, [5.0, 10.0]]) - @pytest.mark.parametrize("sigma", [2.0, [2.0, 5.0]]) - def test__transform(self, alpha, sigma, mocker): - interpolation = InterpolationMode.BILINEAR - fill = 12 - transform = transforms.ElasticTransform(alpha, sigma=sigma, fill=fill, interpolation=interpolation) - - if isinstance(alpha, float): - assert transform.alpha == [alpha, alpha] - else: - assert transform.alpha == alpha - - if isinstance(sigma, float): - assert transform.sigma == [sigma, sigma] - else: - assert transform.sigma == sigma - - fn = mocker.patch("torchvision.prototype.transforms.functional.elastic") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # Let's mock transform._get_params to control the output: - transform._get_params = mocker.MagicMock() - _ = transform(inpt) - params = transform._get_params(inpt) - fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) - - -class TestRandomErasing: - def test_assertions(self, mocker): - with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): - transforms.RandomErasing(value={}) - - with pytest.raises(ValueError, match="If value is str, it should be 'random'"): - transforms.RandomErasing(value="abc") - - with pytest.raises(TypeError, match="Scale should be a sequence"): - transforms.RandomErasing(scale=123) - - with pytest.raises(TypeError, match="Ratio should be a 
sequence"): - transforms.RandomErasing(ratio=123) - - with pytest.raises(ValueError, match="Scale should be between 0 and 1"): - transforms.RandomErasing(scale=[-1, 2]) - - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - transform = transforms.RandomErasing(value=[1, 2, 3, 4]) - - with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): - transform._get_params(image) - - @pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"]) - def test__get_params(self, value, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - transform = transforms.RandomErasing(value=value) - params = transform._get_params(image) - - v = params["v"] - h, w = params["h"], params["w"] - i, j = params["i"], params["j"] - assert isinstance(v, torch.Tensor) - if value == "random": - assert v.shape == (image.num_channels, h, w) - elif isinstance(value, (int, float)): - assert v.shape == (1, 1, 1) - elif isinstance(value, (list, tuple)): - assert v.shape == (image.num_channels, 1, 1) - - assert 0 <= i <= image.image_size[0] - h - assert 0 <= j <= image.image_size[1] - w - - @pytest.mark.parametrize("p", [0, 1]) - def test__transform(self, mocker, p): - transform = transforms.RandomErasing(p=p) - transform._transformed_types = (mocker.MagicMock,) - - i_sentinel = mocker.MagicMock() - j_sentinel = mocker.MagicMock() - h_sentinel = mocker.MagicMock() - w_sentinel = mocker.MagicMock() - v_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._augment.RandomErasing._get_params", - return_value=dict(i=i_sentinel, j=j_sentinel, h=h_sentinel, w=w_sentinel, v=v_sentinel), - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._augment.F.erase") - output = transform(inpt_sentinel) - - if p: - mock.assert_called_once_with( - inpt_sentinel, i=i_sentinel, j=j_sentinel, h=h_sentinel, w=w_sentinel, v=v_sentinel - ) - else: - mock.assert_not_called() - assert output is inpt_sentinel - - -class TestTransform: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test_check_transformed_types(self, inpt_type, mocker): - # This test ensures that we correctly handle which types to transform and which to bypass - t = transforms.Transform() - inpt = mocker.MagicMock(spec=inpt_type) - - if inpt_type in (np.ndarray, str, int): - output = t(inpt) - assert output is inpt - else: - with pytest.raises(NotImplementedError): - t(inpt) - - -class TestToImageTensor: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch( - "torchvision.prototype.transforms.functional.to_image_tensor", - return_value=torch.rand(1, 3, 8, 8), - ) - - inpt = mocker.MagicMock(spec=inpt_type) - transform = transforms.ToImageTensor() - transform(inpt) - if inpt_type in (features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, copy=transform.copy) - - -class TestToImagePIL: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.prototype.transforms.functional.to_image_pil") - - inpt = 
mocker.MagicMock(spec=inpt_type) - transform = transforms.ToImagePIL() - transform(inpt) - if inpt_type in (features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, mode=transform.mode) - - -class TestToPILImage: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.transforms.functional.to_pil_image") - - inpt = mocker.MagicMock(spec=inpt_type) - with pytest.warns(UserWarning, match="deprecated and will be removed"): - transform = transforms.ToPILImage() - transform(inpt) - if inpt_type in (PIL.Image.Image, features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, mode=transform.mode) - - -class TestToTensor: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.transforms.functional.to_tensor") - - inpt = mocker.MagicMock(spec=inpt_type) - with pytest.warns(UserWarning, match="deprecated and will be removed"): - transform = transforms.ToTensor() - transform(inpt) - if inpt_type in (features.Image, torch.Tensor, features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt) - - -class TestCompose: - def test_assertions(self): - with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): - transforms.Compose(123) - - @pytest.mark.parametrize( - "trfms", - [ - [transforms.Pad(2), transforms.RandomCrop(28)], - [lambda x: 2.0 * x], - ], - ) - def test_ctor(self, trfms): - c = transforms.Compose(trfms) - inpt = torch.rand(1, 3, 32, 32) - output = c(inpt) - assert isinstance(output, torch.Tensor) - - -class TestRandomIoUCrop: - @pytest.mark.parametrize("device", cpu_and_gpu()) - @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) - def test__get_params(self, device, options, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - bboxes = features.BoundingBox( - torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), - format="XYXY", - image_size=image.image_size, - device=device, - ) - sample = [image, bboxes] - - transform = transforms.RandomIoUCrop(sampler_options=options) - - n_samples = 5 - for _ in range(n_samples): - - params = transform._get_params(sample) - - if options == [2.0]: - assert len(params) == 0 - return - - assert len(params["is_within_crop_area"]) > 0 - assert params["is_within_crop_area"].dtype == torch.bool - - orig_h = image.image_size[0] - orig_w = image.image_size[1] - assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) - assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) - - left, top = params["left"], params["top"] - new_h, new_w = params["height"], params["width"] - ious = box_iou( - bboxes, - torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device), - ) - assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}" - - def test__transform_empty_params(self, mocker): - transform = transforms.RandomIoUCrop(sampler_options=[2.0]) - image = features.Image(torch.rand(1, 3, 4, 4)) - bboxes = features.BoundingBox(torch.tensor([[1, 1, 
2, 2]]), format="XYXY", image_size=(4, 4)) - label = features.Label(torch.tensor([1])) - sample = [image, bboxes, label] - # Let's mock transform._get_params to control the output: - transform._get_params = mocker.MagicMock(return_value={}) - output = transform(sample) - torch.testing.assert_close(output, sample) - - def test_forward_assertion(self): - transform = transforms.RandomIoUCrop() - with pytest.raises( - TypeError, - match="requires input sample to contain Images or PIL Images, BoundingBoxes and Labels or OneHotLabels", - ): - transform(torch.tensor(0)) - - def test__transform(self, mocker): - transform = transforms.RandomIoUCrop() - - image = features.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,)) - label = features.Label(torch.randint(0, 10, size=(6,))) - ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1)) - masks = make_segmentation_mask((32, 24)) - ohe_masks = features.SegmentationMask(torch.randint(0, 2, size=(6, 32, 24))) - sample = [image, bboxes, label, ohe_label, masks, ohe_masks] - - fn = mocker.patch("torchvision.prototype.transforms.functional.crop", side_effect=lambda x, **params: x) - is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool) - - params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area) - transform._get_params = mocker.MagicMock(return_value=params) - output = transform(sample) - - assert fn.call_count == 4 - - expected_calls = [ - mocker.call(image, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - mocker.call(bboxes, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - mocker.call(masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - mocker.call( - ohe_masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"] - ), - ] - - fn.assert_has_calls(expected_calls) - - expected_within_targets = sum(is_within_crop_area) - - # check number of bboxes vs number of labels: - output_bboxes = output[1] - assert isinstance(output_bboxes, features.BoundingBox) - assert len(output_bboxes) == expected_within_targets - - # check labels - output_label = output[2] - assert isinstance(output_label, features.Label) - assert len(output_label) == expected_within_targets - torch.testing.assert_close(output_label, label[is_within_crop_area]) - - output_ohe_label = output[3] - assert isinstance(output_ohe_label, features.OneHotLabel) - torch.testing.assert_close(output_ohe_label, ohe_label[is_within_crop_area]) - - output_masks = output[4] - assert isinstance(output_masks, features.SegmentationMask) - assert output_masks.shape[:-2] == masks.shape[:-2] - - output_ohe_masks = output[5] - assert isinstance(output_ohe_masks, features.SegmentationMask) - assert len(output_ohe_masks) == expected_within_targets - - -class TestScaleJitter: - def test__get_params(self, mocker): - image_size = (24, 32) - target_size = (16, 12) - scale_range = (0.5, 1.5) - - transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) - params = transform._get_params(sample) - - assert "size" in params - size = params["size"] - - assert isinstance(size, tuple) and len(size) == 2 - height, width = size - - assert int(target_size[0] * scale_range[0]) <= height <= int(target_size[0] * 
scale_range[1]) - assert int(target_size[1] * scale_range[0]) <= width <= int(target_size[1] * scale_range[1]) - - def test__transform(self, mocker): - interpolation_sentinel = mocker.MagicMock() - - transform = transforms.ScaleJitter(target_size=(16, 12), interpolation=interpolation_sentinel) - transform._transformed_types = (mocker.MagicMock,) - - size_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.ScaleJitter._get_params", return_value=dict(size=size_sentinel) - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.resize") - transform(inpt_sentinel) - - mock.assert_called_once_with(inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel) - - -class TestRandomShortestSize: - def test__get_params(self, mocker): - image_size = (3, 10) - min_size = [5, 9] - max_size = 20 - - transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) - - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) - params = transform._get_params(sample) - - assert "size" in params - size = params["size"] - - assert isinstance(size, tuple) and len(size) == 2 - - longer = max(size) - assert longer <= max_size - - shorter = min(size) - if longer == max_size: - assert shorter <= max_size - else: - assert shorter in min_size - - def test__transform(self, mocker): - interpolation_sentinel = mocker.MagicMock() - - transform = transforms.RandomShortestSize(min_size=[3, 5, 7], max_size=12, interpolation=interpolation_sentinel) - transform._transformed_types = (mocker.MagicMock,) - - size_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.RandomShortestSize._get_params", - return_value=dict(size=size_sentinel), - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.resize") - transform(inpt_sentinel) - - mock.assert_called_once_with(inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel) +def make_label(*, extra_dims=(), categories=10, dtype=torch.int64, device="cpu"): + categories, num_categories = _parse_categories(categories) + # The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values, + # regardless of the requested dtype, e.g. 
0 or 0.0 rather than 0 or 0.123 + data = torch.testing.make_tensor(extra_dims, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype) + return tv_tensors.Label(data, categories=categories) class TestSimpleCopyPaste: @@ -1324,21 +50,22 @@ def test__extract_image_targets_assertion(self, mocker): flat_sample = [ # images, batch size = 2 - self.create_fake_image(mocker, features.Image), + self.create_fake_image(mocker, Image), # labels, bboxes, masks - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.SegmentationMask), + mocker.MagicMock(spec=tv_tensors.Label), + mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=Mask), # labels, bboxes, masks - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.SegmentationMask), + mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=Mask), ] - with pytest.raises(TypeError, match="requires input sample to contain equal-sized list of Images"): + with pytest.raises(TypeError, match="requires input sample to contain equal sized list of Images"): transform._extract_image_targets(flat_sample) - @pytest.mark.parametrize("image_type", [features.Image, PIL.Image.Image, torch.Tensor]) - def test__extract_image_targets(self, image_type, mocker): + @pytest.mark.parametrize("image_type", [Image, PIL.Image.Image, torch.Tensor]) + @pytest.mark.parametrize("label_type", [tv_tensors.Label, tv_tensors.OneHotLabel]) + def test__extract_image_targets(self, image_type, label_type, mocker): transform = transforms.SimpleCopyPaste() flat_sample = [ @@ -1346,13 +73,13 @@ def test__extract_image_targets(self, image_type, mocker): self.create_fake_image(mocker, image_type), self.create_fake_image(mocker, image_type), # labels, bboxes, masks - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.SegmentationMask), + mocker.MagicMock(spec=label_type), + mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=Mask), # labels, bboxes, masks - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.SegmentationMask), + mocker.MagicMock(spec=label_type), + mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=Mask), ] images, targets = transform._extract_image_targets(flat_sample) @@ -1365,60 +92,85 @@ def test__extract_image_targets(self, image_type, mocker): assert images[0] == flat_sample[0] assert images[1] == flat_sample[1] - def test__copy_paste(self): + for target in targets: + for key, type_ in [ + ("boxes", BoundingBoxes), + ("masks", Mask), + ("labels", label_type), + ]: + assert key in target + assert isinstance(target[key], type_) + assert target[key] in flat_sample + + @pytest.mark.parametrize("label_type", [tv_tensors.Label, tv_tensors.OneHotLabel]) + def test__copy_paste(self, label_type): image = 2 * torch.ones(3, 32, 32) masks = torch.zeros(2, 32, 32) masks[0, 3:9, 2:8] = 1 masks[1, 20:30, 20:30] = 1 + labels = torch.tensor([1, 2]) + blending = True + resize_interpolation = InterpolationMode.BILINEAR + antialias = None + if label_type == tv_tensors.OneHotLabel: + labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { - "boxes": features.BoundingBox( - torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", image_size=(32, 32) + "boxes": BoundingBoxes( + torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", canvas_size=(32, 32) ), - 
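The `make_label` helper added earlier in this hunk samples label values as int64 and only casts to the requested dtype afterwards; a minimal sketch of that idiom, assuming nothing beyond `torch.testing.make_tensor` (the shape and bounds here are arbitrary illustrations, not values from the patch):

import torch

# Requesting a float dtype directly yields fractional values in [0, 10) ...
fractional = torch.testing.make_tensor((4,), dtype=torch.float32, device="cpu", low=0, high=10)
# ... whereas sampling as int64 and casting afterwards keeps whole numbers such as 0.0 or 7.0.
whole = torch.testing.make_tensor((4,), dtype=torch.int64, device="cpu", low=0, high=10).to(torch.float32)

Casting after integer sampling is what lets the same helper serve floating-point label dtypes without producing non-integral class ids.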
"masks": features.SegmentationMask(masks), - "labels": features.Label(torch.tensor([1, 2])), + "masks": Mask(masks), + "labels": label_type(labels), } paste_image = 10 * torch.ones(3, 32, 32) paste_masks = torch.zeros(2, 32, 32) paste_masks[0, 13:19, 12:18] = 1 paste_masks[1, 15:19, 1:8] = 1 + paste_labels = torch.tensor([3, 4]) + if label_type == tv_tensors.OneHotLabel: + paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { - "boxes": features.BoundingBox( - torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", image_size=(32, 32) + "boxes": BoundingBoxes( + torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", canvas_size=(32, 32) ), - "masks": features.SegmentationMask(paste_masks), - "labels": features.Label(torch.tensor([3, 4])), + "masks": Mask(paste_masks), + "labels": label_type(paste_labels), } transform = transforms.SimpleCopyPaste() random_selection = torch.tensor([0, 1]) - output_image, output_target = transform._copy_paste(image, target, paste_image, paste_target, random_selection) + output_image, output_target = transform._copy_paste( + image, target, paste_image, paste_target, random_selection, blending, resize_interpolation, antialias + ) assert output_image.unique().tolist() == [2, 10] assert output_target["boxes"].shape == (4, 4) torch.testing.assert_close(output_target["boxes"][:2, :], target["boxes"]) torch.testing.assert_close(output_target["boxes"][2:, :], paste_target["boxes"]) - torch.testing.assert_close(output_target["labels"], features.Label(torch.tensor([1, 2, 3, 4]))) + + expected_labels = torch.tensor([1, 2, 3, 4]) + if label_type == tv_tensors.OneHotLabel: + expected_labels = torch.nn.functional.one_hot(expected_labels, num_classes=5) + torch.testing.assert_close(output_target["labels"], label_type(expected_labels)) + assert output_target["masks"].shape == (4, 32, 32) torch.testing.assert_close(output_target["masks"][:2, :], target["masks"]) torch.testing.assert_close(output_target["masks"][2:, :], paste_target["masks"]) class TestFixedSizeCrop: - def test__get_params(self, mocker): + def test_make_params(self, mocker): crop_size = (7, 7) batch_shape = (10,) - image_size = (11, 5) + canvas_size = (11, 5) transform = transforms.FixedSizeCrop(size=crop_size) - sample = dict( - image=make_image(size=image_size, color_space=features.ColorSpace.RGB), - bounding_boxes=make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=batch_shape - ), - ) - params = transform._get_params(sample) + flat_inputs = [ + make_image(size=canvas_size, color_space="RGB"), + make_bounding_boxes(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_shape[0]), + ] + params = transform.make_params(flat_inputs) assert params["needs_crop"] assert params["height"] <= crop_size[0] @@ -1433,163 +185,245 @@ def test__get_params(self, mocker): assert params["needs_pad"] assert any(pad > 0 for pad in params["padding"]) - @pytest.mark.parametrize("needs", list(itertools.product((False, True), repeat=2))) - def test__transform(self, mocker, needs): - fill_sentinel = mocker.MagicMock() - padding_mode_sentinel = mocker.MagicMock() - - transform = transforms.FixedSizeCrop((-1, -1), fill=fill_sentinel, padding_mode=padding_mode_sentinel) - transform._transformed_types = (mocker.MagicMock,) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) - mocker.patch("torchvision.prototype.transforms._geometry.has_any", 
return_value=True) - - needs_crop, needs_pad = needs - top_sentinel = mocker.MagicMock() - left_sentinel = mocker.MagicMock() - height_sentinel = mocker.MagicMock() - width_sentinel = mocker.MagicMock() - padding_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", - return_value=dict( - needs_crop=needs_crop, - top=top_sentinel, - left=left_sentinel, - height=height_sentinel, - width=width_sentinel, - padding=padding_sentinel, - needs_pad=needs_pad, - ), - ) - - inpt_sentinel = mocker.MagicMock() - - mock_crop = mocker.patch("torchvision.prototype.transforms._geometry.F.crop") - mock_pad = mocker.patch("torchvision.prototype.transforms._geometry.F.pad") - transform(inpt_sentinel) - - if needs_crop: - mock_crop.assert_called_once_with( - inpt_sentinel, - top=top_sentinel, - left=left_sentinel, - height=height_sentinel, - width=width_sentinel, - ) - else: - mock_crop.assert_not_called() - - if needs_pad: - # If we cropped before, the input to F.pad is no longer inpt_sentinel. Thus, we can't use - # `MagicMock.assert_called_once_with` and have to perform the checks manually - mock_pad.assert_called_once() - args, kwargs = mock_pad.call_args - if not needs_crop: - assert args[0] is inpt_sentinel - assert args[1] is padding_sentinel - assert kwargs == dict(fill=fill_sentinel, padding_mode=padding_mode_sentinel) - else: - mock_pad.assert_not_called() - def test__transform_culling(self, mocker): batch_size = 10 - image_size = (10, 10) + canvas_size = (10, 10) is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", + "torchvision.prototype.transforms._geometry.FixedSizeCrop.make_params", return_value=dict( needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=canvas_size[0], + width=canvas_size[1], is_valid=is_valid, needs_pad=False, ), ) - bounding_boxes = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) + bounding_boxes = make_bounding_boxes( + format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_size ) - segmentation_masks = make_segmentation_mask(size=image_size, extra_dims=(batch_size,)) - labels = make_label(size=(batch_size,)) + masks = make_detection_masks(size=canvas_size, num_masks=batch_size) + labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True) output = transform( dict( bounding_boxes=bounding_boxes, - segmentation_masks=segmentation_masks, + masks=masks, labels=labels, ) ) assert_equal(output["bounding_boxes"], bounding_boxes[is_valid]) - assert_equal(output["segmentation_masks"], segmentation_masks[is_valid]) + assert_equal(output["masks"], masks[is_valid]) assert_equal(output["labels"], labels[is_valid]) - def test__transform_bounding_box_clamping(self, mocker): + def test__transform_bounding_boxes_clamping(self, mocker): batch_size = 3 - image_size = (10, 10) + canvas_size = (10, 10) mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", + "torchvision.prototype.transforms._geometry.FixedSizeCrop.make_params", return_value=dict( needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=canvas_size[0], + width=canvas_size[1], 
is_valid=torch.full((batch_size,), fill_value=True), needs_pad=False, ), ) - bounding_box = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) + bounding_boxes = make_bounding_boxes( + format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_size + ) + mock = mocker.patch( + "torchvision.prototype.transforms._geometry.F.clamp_bounding_boxes", wraps=clamp_bounding_boxes ) - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") transform = transforms.FixedSizeCrop((-1, -1)) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True) - transform(bounding_box) + transform(bounding_boxes) mock.assert_called_once() -class TestLinearTransformation: - def test_assertions(self): - with pytest.raises(ValueError, match="transformation_matrix should be square"): - transforms.LinearTransformation(torch.rand(2, 3), torch.rand(5)) +class TestLabelToOneHot: + def test__transform(self): + categories = ["apple", "pear", "pineapple"] + labels = tv_tensors.Label(torch.tensor([0, 1, 2, 1]), categories=categories) + transform = transforms.LabelToOneHot() + ohe_labels = transform(labels) + assert isinstance(ohe_labels, tv_tensors.OneHotLabel) + assert ohe_labels.shape == (4, 3) + assert ohe_labels.categories == labels.categories == categories - with pytest.raises(ValueError, match="mean_vector should have the same length"): - transforms.LinearTransformation(torch.rand(3, 3), torch.rand(5)) +class TestPermuteDimensions: @pytest.mark.parametrize( - "inpt", + ("dims", "inverse_dims"), [ - 122 * torch.ones(1, 3, 8, 8), - 122.0 * torch.ones(1, 3, 8, 8), - features.Image(122 * torch.ones(1, 3, 8, 8)), - PIL.Image.new("RGB", (8, 8), (122, 122, 122)), + ( + {Image: (2, 1, 0), Video: None}, + {Image: (2, 1, 0), Video: None}, + ), + ( + {Image: (2, 1, 0), Video: (1, 2, 3, 0)}, + {Image: (2, 1, 0), Video: (3, 0, 1, 2)}, + ), ], ) - def test__transform(self, inpt): + def test_call(self, dims, inverse_dims): + sample = dict( + image=make_image(), + bounding_boxes=make_bounding_boxes(format=BoundingBoxFormat.XYXY), + video=make_video(), + str="str", + int=0, + ) - v = 121 * torch.ones(3 * 8 * 8) - m = torch.ones(3 * 8 * 8, 3 * 8 * 8) - transform = transforms.LinearTransformation(m, v) + transform = transforms.PermuteDimensions(dims) + transformed_sample = transform(sample) - if isinstance(inpt, PIL.Image.Image): - with pytest.raises(TypeError, match="Unsupported input type"): - transform(inpt) - else: - output = transform(inpt) - assert isinstance(output, torch.Tensor) - assert output.unique() == 3 * 8 * 8 - assert output.dtype == inpt.dtype + for key, value in sample.items(): + value_type = type(value) + transformed_value = transformed_sample[key] + + if check_type(value, (Image, is_pure_tensor, Video)): + if transform.dims.get(value_type) is not None: + assert transformed_value.permute(inverse_dims[value_type]).equal(value) + assert type(transformed_value) == torch.Tensor + else: + assert transformed_value is value + + @pytest.mark.filterwarnings("error") + def test_plain_tensor_call(self): + tensor = torch.empty((2, 3, 4)) + transform = transforms.PermuteDimensions(dims=(1, 2, 0)) + + assert transform(tensor).shape == (3, 4, 2) + + @pytest.mark.parametrize("other_type", [Image, Video]) + def test_plain_tensor_warning(self, other_type): + with pytest.warns(UserWarning, match=re.escape("`torch.Tensor` will 
*not* be transformed")): + transforms.PermuteDimensions(dims={torch.Tensor: (0, 1), other_type: (1, 0)}) + + +class TestTransposeDimensions: + @pytest.mark.parametrize( + "dims", + [ + (-1, -2), + {Image: (1, 2), Video: None}, + ], + ) + def test_call(self, dims): + sample = dict( + image=make_image(), + bounding_boxes=make_bounding_boxes(format=BoundingBoxFormat.XYXY), + video=make_video(), + str="str", + int=0, + ) + + transform = transforms.TransposeDimensions(dims) + transformed_sample = transform(sample) + + for key, value in sample.items(): + value_type = type(value) + transformed_value = transformed_sample[key] + + transposed_dims = transform.dims.get(value_type) + if check_type(value, (Image, is_pure_tensor, Video)): + if transposed_dims is not None: + assert transformed_value.transpose(*transposed_dims).equal(value) + assert type(transformed_value) == torch.Tensor + else: + assert transformed_value is value + + @pytest.mark.filterwarnings("error") + def test_plain_tensor_call(self): + tensor = torch.empty((2, 3, 4)) + transform = transforms.TransposeDimensions(dims=(0, 2)) + + assert transform(tensor).shape == (4, 3, 2) + + @pytest.mark.parametrize("other_type", [Image, Video]) + def test_plain_tensor_warning(self, other_type): + with pytest.warns(UserWarning, match=re.escape("`torch.Tensor` will *not* be transformed")): + transforms.TransposeDimensions(dims={torch.Tensor: (0, 1), other_type: (1, 0)}) + + +import importlib.machinery +import importlib.util +from pathlib import Path + + +def import_transforms_from_references(reference): + HERE = Path(__file__).parent + PROJECT_ROOT = HERE.parent + + loader = importlib.machinery.SourceFileLoader( + "transforms", str(PROJECT_ROOT / "references" / reference / "transforms.py") + ) + spec = importlib.util.spec_from_loader("transforms", loader) + module = importlib.util.module_from_spec(spec) + loader.exec_module(module) + return module + + +det_transforms = import_transforms_from_references("detection") + + +def test_fixed_sized_crop_against_detection_reference(): + def make_tv_tensors(): + size = (600, 800) + num_objects = 22 + + pil_image = to_pil_image(make_image(size=size, color_space="RGB")) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), + } + + yield (pil_image, target) + + tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), + } + + yield (tensor_image, target) + + tv_tensor_image = make_image(size=size, color_space="RGB") + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), + } + + yield (tv_tensor_image, target) + + t = transforms.FixedSizeCrop((1024, 1024), fill=0) + t_ref = det_transforms.FixedSizeCrop((1024, 1024), fill=0) + + for dp in make_tv_tensors(): + # We should use prototype transform first as reference transform performs inplace target update + torch.manual_seed(12) + output = t(dp) + + 
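The comment above notes that the reference detection transform updates its target in place, which is why the prototype transform is applied to `dp` first; a minimal sketch of how in-place mutation would otherwise skew the comparison (hypothetical stand-in callables in plain Python, not the torchvision API):

def reference_transform(image, target):
    # Mutates the caller's target dict, like the detection reference transforms.
    target["boxes"] = [b + 1 for b in target["boxes"]]
    return image, target

def prototype_transform(sample):
    image, target = sample
    # Returns a new target instead of mutating the input.
    return image, {**target, "boxes": [b * 2 for b in target["boxes"]]}

sample = ("image", {"boxes": [0, 0, 10, 10]})
output = prototype_transform(sample)      # sees the original boxes
expected = reference_transform(*sample)   # may now mutate `sample` without affecting `output`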
torch.manual_seed(12) + expected_output = t_ref(*dp) + + assert_equal(expected_output, output) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py deleted file mode 100644 index cd11eb2a35e..00000000000 --- a/test/test_prototype_transforms_functional.py +++ /dev/null @@ -1,1910 +0,0 @@ -import functools -import itertools -import math -import os - -import numpy as np -import PIL.Image -import pytest -import torch.testing -import torchvision.prototype.transforms.functional as F -from common_utils import cpu_and_gpu -from torch import jit -from torch.nn.functional import one_hot -from torchvision.prototype import features -from torchvision.prototype.transforms.functional._geometry import _center_crop_compute_padding -from torchvision.prototype.transforms.functional._meta import convert_bounding_box_format -from torchvision.transforms.functional import _get_perspective_coeffs -from torchvision.transforms.functional_tensor import _max_value as get_max_value - -make_tensor = functools.partial(torch.testing.make_tensor, device="cpu") - - -def make_image(size=None, *, color_space, extra_dims=(), dtype=torch.float32, constant_alpha=True): - size = size or torch.randint(16, 33, (2,)).tolist() - - try: - num_channels = { - features.ColorSpace.GRAY: 1, - features.ColorSpace.GRAY_ALPHA: 2, - features.ColorSpace.RGB: 3, - features.ColorSpace.RGB_ALPHA: 4, - }[color_space] - except KeyError as error: - raise pytest.UsageError() from error - - shape = (*extra_dims, num_channels, *size) - max_value = get_max_value(dtype) - data = make_tensor(shape, low=0, high=max_value, dtype=dtype) - if color_space in {features.ColorSpace.GRAY_ALPHA, features.ColorSpace.RGB_ALPHA} and constant_alpha: - data[..., -1, :, :] = max_value - return features.Image(data, color_space=color_space) - - -make_grayscale_image = functools.partial(make_image, color_space=features.ColorSpace.GRAY) -make_rgb_image = functools.partial(make_image, color_space=features.ColorSpace.RGB) - - -def make_images( - sizes=((16, 16), (7, 33), (31, 9)), - color_spaces=( - features.ColorSpace.GRAY, - features.ColorSpace.GRAY_ALPHA, - features.ColorSpace.RGB, - features.ColorSpace.RGB_ALPHA, - ), - dtypes=(torch.float32, torch.uint8), - extra_dims=((4,), (2, 3)), -): - for size, color_space, dtype in itertools.product(sizes, color_spaces, dtypes): - yield make_image(size, color_space=color_space, dtype=dtype) - - for color_space, dtype, extra_dims_ in itertools.product(color_spaces, dtypes, extra_dims): - yield make_image(size=sizes[0], color_space=color_space, extra_dims=extra_dims_, dtype=dtype) - - -def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): - low, high = torch.broadcast_tensors( - *[torch.as_tensor(arg) for arg in ((0, arg1) if arg2 is None else (arg1, arg2))] - ) - return torch.stack( - [ - torch.randint(low_scalar, high_scalar, (), **kwargs) - for low_scalar, high_scalar in zip(low.flatten().tolist(), high.flatten().tolist()) - ] - ).reshape(low.shape) - - -def make_bounding_box(*, format, image_size=(32, 32), extra_dims=(), dtype=torch.int64): - if isinstance(format, str): - format = features.BoundingBoxFormat[format] - - height, width = image_size - - if format == features.BoundingBoxFormat.XYXY: - x1 = torch.randint(0, width // 2, extra_dims) - y1 = torch.randint(0, height // 2, extra_dims) - x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1 - y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1 - parts = (x1, y1, x2, y2) - elif format == 
features.BoundingBoxFormat.XYWH: - x = torch.randint(0, width // 2, extra_dims) - y = torch.randint(0, height // 2, extra_dims) - w = randint_with_tensor_bounds(1, width - x) - h = randint_with_tensor_bounds(1, height - y) - parts = (x, y, w, h) - elif format == features.BoundingBoxFormat.CXCYWH: - cx = torch.randint(1, width - 1, ()) - cy = torch.randint(1, height - 1, ()) - w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1) - h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1) - parts = (cx, cy, w, h) - else: - raise pytest.UsageError() - - return features.BoundingBox(torch.stack(parts, dim=-1).to(dtype), format=format, image_size=image_size) - - -make_xyxy_bounding_box = functools.partial(make_bounding_box, format=features.BoundingBoxFormat.XYXY) - - -def make_bounding_boxes( - formats=(features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH), - image_sizes=((32, 32),), - dtypes=(torch.int64, torch.float32), - extra_dims=((4,), (2, 3)), -): - for format, image_size, dtype in itertools.product(formats, image_sizes, dtypes): - yield make_bounding_box(format=format, image_size=image_size, dtype=dtype) - - for format, extra_dims_ in itertools.product(formats, extra_dims): - yield make_bounding_box(format=format, extra_dims=extra_dims_) - - -def make_label(size=(), *, categories=("category0", "category1")): - return features.Label(torch.randint(0, len(categories) if categories else 10, size), categories=categories) - - -def make_one_hot_label(*args, **kwargs): - label = make_label(*args, **kwargs) - return features.OneHotLabel(one_hot(label, num_classes=len(label.categories)), categories=label.categories) - - -def make_one_hot_labels( - *, - num_categories=(1, 2, 10), - extra_dims=((4,), (2, 3)), -): - for num_categories_ in num_categories: - yield make_one_hot_label(categories=[f"category{idx}" for idx in range(num_categories_)]) - - for extra_dims_ in extra_dims: - yield make_one_hot_label(extra_dims_) - - -def make_segmentation_mask(size=None, *, num_categories=80, extra_dims=(), dtype=torch.long): - size = size or torch.randint(16, 33, (2,)).tolist() - shape = (*extra_dims, 1, *size) - data = make_tensor(shape, low=0, high=num_categories, dtype=dtype) - return features.SegmentationMask(data) - - -def make_segmentation_masks( - sizes=((16, 16), (7, 33), (31, 9)), - dtypes=(torch.long,), - extra_dims=((), (4,), (2, 3)), -): - for size, dtype, extra_dims_ in itertools.product(sizes, dtypes, extra_dims): - yield make_segmentation_mask(size=size, dtype=dtype, extra_dims=extra_dims_) - - -class SampleInput: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - -class FunctionalInfo: - def __init__(self, name, *, sample_inputs_fn): - self.name = name - self.functional = getattr(F, name) - self._sample_inputs_fn = sample_inputs_fn - - def sample_inputs(self): - yield from self._sample_inputs_fn() - - def __call__(self, *args, **kwargs): - if len(args) == 1 and not kwargs and isinstance(args[0], SampleInput): - sample_input = args[0] - return self.functional(*sample_input.args, **sample_input.kwargs) - - return self.functional(*args, **kwargs) - - -FUNCTIONAL_INFOS = [] - - -def register_kernel_info_from_sample_inputs_fn(sample_inputs_fn): - FUNCTIONAL_INFOS.append(FunctionalInfo(sample_inputs_fn.__name__, sample_inputs_fn=sample_inputs_fn)) - return sample_inputs_fn - - -@register_kernel_info_from_sample_inputs_fn -def horizontal_flip_image_tensor(): - for image in make_images(): - 
yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def horizontal_flip_bounding_box(): - for bounding_box in make_bounding_boxes(formats=[features.BoundingBoxFormat.XYXY]): - yield SampleInput(bounding_box, format=bounding_box.format, image_size=bounding_box.image_size) - - -@register_kernel_info_from_sample_inputs_fn -def horizontal_flip_segmentation_mask(): - for mask in make_segmentation_masks(): - yield SampleInput(mask) - - -@register_kernel_info_from_sample_inputs_fn -def vertical_flip_image_tensor(): - for image in make_images(): - yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def vertical_flip_bounding_box(): - for bounding_box in make_bounding_boxes(formats=[features.BoundingBoxFormat.XYXY]): - yield SampleInput(bounding_box, format=bounding_box.format, image_size=bounding_box.image_size) - - -@register_kernel_info_from_sample_inputs_fn -def vertical_flip_segmentation_mask(): - for mask in make_segmentation_masks(): - yield SampleInput(mask) - - -@register_kernel_info_from_sample_inputs_fn -def resize_image_tensor(): - for image, interpolation, max_size, antialias in itertools.product( - make_images(), - [F.InterpolationMode.BILINEAR, F.InterpolationMode.NEAREST], # interpolation - [None, 34], # max_size - [False, True], # antialias - ): - - if antialias and interpolation == F.InterpolationMode.NEAREST: - continue - - height, width = image.shape[-2:] - for size in [ - (height, width), - (int(height * 0.75), int(width * 1.25)), - ]: - if max_size is not None: - size = [size[0]] - yield SampleInput(image, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias) - - -@register_kernel_info_from_sample_inputs_fn -def resize_bounding_box(): - for bounding_box, max_size in itertools.product( - make_bounding_boxes(), - [None, 34], # max_size - ): - height, width = bounding_box.image_size - for size in [ - (height, width), - (int(height * 0.75), int(width * 1.25)), - ]: - if max_size is not None: - size = [size[0]] - yield SampleInput(bounding_box, size=size, image_size=bounding_box.image_size) - - -@register_kernel_info_from_sample_inputs_fn -def resize_segmentation_mask(): - for mask, max_size in itertools.product( - make_segmentation_masks(), - [None, 34], # max_size - ): - height, width = mask.shape[-2:] - for size in [ - (height, width), - (int(height * 0.75), int(width * 1.25)), - ]: - if max_size is not None: - size = [size[0]] - yield SampleInput(mask, size=size, max_size=max_size) - - -@register_kernel_info_from_sample_inputs_fn -def affine_image_tensor(): - for image, angle, translate, scale, shear in itertools.product( - make_images(extra_dims=((), (4,))), - [-87, 15, 90], # angle - [5, -5], # translate - [0.77, 1.27], # scale - [0, 12], # shear - ): - yield SampleInput( - image, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - interpolation=F.InterpolationMode.NEAREST, - ) - - -@register_kernel_info_from_sample_inputs_fn -def affine_bounding_box(): - for bounding_box, angle, translate, scale, shear in itertools.product( - make_bounding_boxes(), - [-87, 15, 90], # angle - [5, -5], # translate - [0.77, 1.27], # scale - [0, 12], # shear - ): - yield SampleInput( - bounding_box, - format=bounding_box.format, - image_size=bounding_box.image_size, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - ) - - -@register_kernel_info_from_sample_inputs_fn -def affine_segmentation_mask(): - for mask, angle, translate, scale, shear in 
itertools.product( - make_segmentation_masks(extra_dims=((), (4,))), - [-87, 15, 90], # angle - [5, -5], # translate - [0.77, 1.27], # scale - [0, 12], # shear - ): - yield SampleInput( - mask, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - ) - - -@register_kernel_info_from_sample_inputs_fn -def rotate_image_tensor(): - for image, angle, expand, center, fill in itertools.product( - make_images(extra_dims=((), (4,))), - [-87, 15, 90], # angle - [True, False], # expand - [None, [12, 23]], # center - [None, [128], [12.0]], # fill - ): - if center is not None and expand: - # Skip warning: The provided center argument is ignored if expand is True - continue - - yield SampleInput(image, angle=angle, expand=expand, center=center, fill=fill) - - -@register_kernel_info_from_sample_inputs_fn -def rotate_bounding_box(): - for bounding_box, angle, expand, center in itertools.product( - make_bounding_boxes(), [-87, 15, 90], [True, False], [None, [12, 23]] - ): - if center is not None and expand: - # Skip warning: The provided center argument is ignored if expand is True - continue - - yield SampleInput( - bounding_box, - format=bounding_box.format, - image_size=bounding_box.image_size, - angle=angle, - expand=expand, - center=center, - ) - - -@register_kernel_info_from_sample_inputs_fn -def rotate_segmentation_mask(): - for mask, angle, expand, center in itertools.product( - make_segmentation_masks(extra_dims=((), (4,))), - [-87, 15, 90], # angle - [True, False], # expand - [None, [12, 23]], # center - ): - if center is not None and expand: - # Skip warning: The provided center argument is ignored if expand is True - continue - - yield SampleInput( - mask, - angle=angle, - expand=expand, - center=center, - ) - - -@register_kernel_info_from_sample_inputs_fn -def crop_image_tensor(): - for image, top, left, height, width in itertools.product(make_images(), [-8, 0, 9], [-8, 0, 9], [12, 20], [12, 20]): - yield SampleInput( - image, - top=top, - left=left, - height=height, - width=width, - ) - - -@register_kernel_info_from_sample_inputs_fn -def crop_bounding_box(): - for bounding_box, top, left in itertools.product(make_bounding_boxes(), [-8, 0, 9], [-8, 0, 9]): - yield SampleInput( - bounding_box, - format=bounding_box.format, - top=top, - left=left, - ) - - -@register_kernel_info_from_sample_inputs_fn -def crop_segmentation_mask(): - for mask, top, left, height, width in itertools.product( - make_segmentation_masks(), [-8, 0, 9], [-8, 0, 9], [12, 20], [12, 20] - ): - yield SampleInput( - mask, - top=top, - left=left, - height=height, - width=width, - ) - - -@register_kernel_info_from_sample_inputs_fn -def resized_crop_image_tensor(): - for mask, top, left, height, width, size, antialias in itertools.product( - make_images(), - [-8, 9], - [-8, 9], - [12], - [12], - [(16, 18)], - [True, False], - ): - yield SampleInput(mask, top=top, left=left, height=height, width=width, size=size, antialias=antialias) - - -@register_kernel_info_from_sample_inputs_fn -def resized_crop_bounding_box(): - for bounding_box, top, left, height, width, size in itertools.product( - make_bounding_boxes(), [-8, 9], [-8, 9], [32, 22], [34, 20], [(32, 32), (16, 18)] - ): - yield SampleInput( - bounding_box, format=bounding_box.format, top=top, left=left, height=height, width=width, size=size - ) - - -@register_kernel_info_from_sample_inputs_fn -def resized_crop_segmentation_mask(): - for mask, top, left, height, width, size in itertools.product( - make_segmentation_masks(), [-8, 0, 9], [-8, 
0, 9], [12, 20], [12, 20], [(32, 32), (16, 18)] - ): - yield SampleInput(mask, top=top, left=left, height=height, width=width, size=size) - - -@register_kernel_info_from_sample_inputs_fn -def pad_image_tensor(): - for image, padding, fill, padding_mode in itertools.product( - make_images(), - [[1], [1, 1], [1, 1, 2, 2]], # padding - [None, 12, 12.0], # fill - ["constant", "symmetric", "edge", "reflect"], # padding mode, - ): - yield SampleInput(image, padding=padding, fill=fill, padding_mode=padding_mode) - - -@register_kernel_info_from_sample_inputs_fn -def pad_segmentation_mask(): - for mask, padding, padding_mode in itertools.product( - make_segmentation_masks(), - [[1], [1, 1], [1, 1, 2, 2]], # padding - ["constant", "symmetric", "edge", "reflect"], # padding mode, - ): - yield SampleInput(mask, padding=padding, padding_mode=padding_mode) - - -@register_kernel_info_from_sample_inputs_fn -def pad_bounding_box(): - for bounding_box, padding in itertools.product( - make_bounding_boxes(), - [[1], [1, 1], [1, 1, 2, 2]], - ): - yield SampleInput(bounding_box, padding=padding, format=bounding_box.format) - - -@register_kernel_info_from_sample_inputs_fn -def perspective_image_tensor(): - for image, perspective_coeffs, fill in itertools.product( - make_images(extra_dims=((), (4,))), - [ - [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], - [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], - ], - [None, [128], [12.0]], # fill - ): - yield SampleInput(image, perspective_coeffs=perspective_coeffs, fill=fill) - - -@register_kernel_info_from_sample_inputs_fn -def perspective_bounding_box(): - for bounding_box, perspective_coeffs in itertools.product( - make_bounding_boxes(), - [ - [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], - [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], - ], - ): - yield SampleInput( - bounding_box, - format=bounding_box.format, - perspective_coeffs=perspective_coeffs, - ) - - -@register_kernel_info_from_sample_inputs_fn -def perspective_segmentation_mask(): - for mask, perspective_coeffs in itertools.product( - make_segmentation_masks(extra_dims=((), (4,))), - [ - [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], - [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], - ], - ): - yield SampleInput( - mask, - perspective_coeffs=perspective_coeffs, - ) - - -@register_kernel_info_from_sample_inputs_fn -def elastic_image_tensor(): - for image, fill in itertools.product( - make_images(extra_dims=((), (4,))), - [None, [128], [12.0]], # fill - ): - h, w = image.shape[-2:] - displacement = torch.rand(1, h, w, 2) - yield SampleInput(image, displacement=displacement, fill=fill) - - -@register_kernel_info_from_sample_inputs_fn -def elastic_bounding_box(): - for bounding_box in make_bounding_boxes(): - h, w = bounding_box.image_size - displacement = torch.rand(1, h, w, 2) - yield SampleInput( - bounding_box, - format=bounding_box.format, - displacement=displacement, - ) - - -@register_kernel_info_from_sample_inputs_fn -def elastic_segmentation_mask(): - for mask in make_segmentation_masks(extra_dims=((), (4,))): - h, w = mask.shape[-2:] - displacement = torch.rand(1, h, w, 2) - yield SampleInput( - mask, - displacement=displacement, - ) - - -@register_kernel_info_from_sample_inputs_fn -def center_crop_image_tensor(): - for mask, output_size in itertools.product( - make_images(sizes=((16, 16), (7, 33), (31, 9))), - [[4, 3], [42, 70], [4]], # crop sizes < image sizes, 
crop_sizes > image sizes, single crop size - ): - yield SampleInput(mask, output_size) - - -@register_kernel_info_from_sample_inputs_fn -def center_crop_bounding_box(): - for bounding_box, output_size in itertools.product(make_bounding_boxes(), [(24, 12), [16, 18], [46, 48], [12]]): - yield SampleInput( - bounding_box, format=bounding_box.format, output_size=output_size, image_size=bounding_box.image_size - ) - - -@register_kernel_info_from_sample_inputs_fn -def center_crop_segmentation_mask(): - for mask, output_size in itertools.product( - make_segmentation_masks(sizes=((16, 16), (7, 33), (31, 9))), - [[4, 3], [42, 70], [4]], # crop sizes < image sizes, crop_sizes > image sizes, single crop size - ): - yield SampleInput(mask, output_size) - - -@register_kernel_info_from_sample_inputs_fn -def gaussian_blur_image_tensor(): - for image, kernel_size, sigma in itertools.product( - make_images(extra_dims=((4,),)), - [[3, 3]], - [None, [3.0, 3.0]], - ): - yield SampleInput(image, kernel_size=kernel_size, sigma=sigma) - - -@register_kernel_info_from_sample_inputs_fn -def equalize_image_tensor(): - for image in make_images(extra_dims=(), color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): - if image.dtype != torch.uint8: - continue - yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def invert_image_tensor(): - for image in make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): - yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def posterize_image_tensor(): - for image, bits in itertools.product( - make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), - [1, 4, 8], - ): - if image.dtype != torch.uint8: - continue - yield SampleInput(image, bits=bits) - - -@register_kernel_info_from_sample_inputs_fn -def solarize_image_tensor(): - for image, threshold in itertools.product( - make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), - [0.1, 0.5, 127.0], - ): - if image.is_floating_point() and threshold > 1.0: - continue - yield SampleInput(image, threshold=threshold) - - -@register_kernel_info_from_sample_inputs_fn -def autocontrast_image_tensor(): - for image in make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): - yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def adjust_sharpness_image_tensor(): - for image, sharpness_factor in itertools.product( - make_images(extra_dims=((4,),), color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), - [0.1, 0.5], - ): - yield SampleInput(image, sharpness_factor=sharpness_factor) - - -@register_kernel_info_from_sample_inputs_fn -def erase_image_tensor(): - for image in make_images(): - c = image.shape[-3] - yield SampleInput(image, i=1, j=2, h=6, w=7, v=torch.rand(c, 6, 7)) - - -@pytest.mark.parametrize( - "kernel", - [ - pytest.param(kernel, id=name) - for name, kernel in F.__dict__.items() - if not name.startswith("_") - and callable(kernel) - and any(feature_type in name for feature_type in {"image", "segmentation_mask", "bounding_box", "label"}) - and "pil" not in name - and name - not in { - "to_image_tensor", - } - ], -) -def test_scriptable(kernel): - jit.script(kernel) - - -# Test below is intended to test mid-level op vs low-level ops it calls -# For example, resize -> resize_image_tensor, resize_bounding_boxes etc -# TODO: Rewrite this tests as sample args may include more or less params -# than needed by functions -@pytest.mark.parametrize( - "func", - 
[ - pytest.param(func, id=name) - for name, func in F.__dict__.items() - if not name.startswith("_") - and callable(func) - and all( - feature_type not in name for feature_type in {"image", "segmentation_mask", "bounding_box", "label", "pil"} - ) - and name - not in { - "to_image_tensor", - "InterpolationMode", - "decode_video_with_av", - "crop", - "perspective", - "elastic_transform", - "elastic", - } - # We skip 'crop' due to missing 'height' and 'width' - # We skip 'perspective' as it requires different input args than perspective_image_tensor etc - # Skip 'elastic', TODO: inspect why test is failing - ], -) -def test_functional_mid_level(func): - finfos = [finfo for finfo in FUNCTIONAL_INFOS if f"{func.__name__}_" in finfo.name] - for finfo in finfos: - for sample_input in finfo.sample_inputs(): - expected = finfo(sample_input) - kwargs = dict(sample_input.kwargs) - for key in ["format", "image_size"]: - if key in kwargs: - del kwargs[key] - output = func(*sample_input.args, **kwargs) - torch.testing.assert_close( - output, expected, msg=f"finfo={finfo.name}, output={output}, expected={expected}" - ) - break - - -@pytest.mark.parametrize( - ("functional_info", "sample_input"), - [ - pytest.param(functional_info, sample_input, id=f"{functional_info.name}-{idx}") - for functional_info in FUNCTIONAL_INFOS - for idx, sample_input in enumerate(functional_info.sample_inputs()) - ], -) -def test_eager_vs_scripted(functional_info, sample_input): - eager = functional_info(sample_input) - scripted = jit.script(functional_info.functional)(*sample_input.args, **sample_input.kwargs) - - torch.testing.assert_close(eager, scripted) - - -def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): - rot = math.radians(angle_) - cx, cy = center_ - tx, ty = translate_ - sx, sy = [math.radians(sh_) for sh_ in shear_] - - c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) - t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) - c_matrix_inv = np.linalg.inv(c_matrix) - rs_matrix = np.array( - [ - [scale_ * math.cos(rot), -scale_ * math.sin(rot), 0], - [scale_ * math.sin(rot), scale_ * math.cos(rot), 0], - [0, 0, 1], - ] - ) - shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) - shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) - rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) - true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) - return true_matrix - - -@pytest.mark.parametrize("angle", range(-90, 90, 56)) -@pytest.mark.parametrize("translate", range(-10, 10, 8)) -@pytest.mark.parametrize("scale", [0.77, 1.0, 1.27]) -@pytest.mark.parametrize("shear", range(-15, 15, 8)) -@pytest.mark.parametrize("center", [None, (12, 14)]) -def test_correctness_affine_bounding_box(angle, translate, scale, shear, center): - def _compute_expected_bbox(bbox, angle_, translate_, scale_, shear_, center_): - affine_matrix = _compute_affine_matrix(angle_, translate_, scale_, shear_, center_) - affine_matrix = affine_matrix[:2, :] - - bbox_xyxy = convert_bounding_box_format( - bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY - ) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - ] - ) - transformed_points = np.matmul(points, affine_matrix.T) - out_bbox = [ - np.min(transformed_points[:, 0]), - 
np.min(transformed_points[:, 1]), - np.max(transformed_points[:, 0]), - np.max(transformed_points[:, 1]), - ] - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYXY, - image_size=bbox.image_size, - dtype=torch.float32, - device=bbox.device, - ) - return convert_bounding_box_format( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False - ) - - image_size = (32, 38) - - for bboxes in make_bounding_boxes( - image_sizes=[ - image_size, - ], - extra_dims=((4,),), - ): - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_bboxes = F.affine_bounding_box( - bboxes, - bboxes_format, - image_size=bboxes_image_size, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in bboxes_image_size[::-1]] - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append( - _compute_expected_bbox(bbox, angle, (translate, translate), scale, (shear, shear), center_) - ) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_affine_bounding_box_on_fixed_input(device): - # Check transformation against known expected output - image_size = (64, 64) - # xyxy format - in_boxes = [ - [20, 25, 35, 45], - [50, 5, 70, 22], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], - [1, 1, 5, 5], - ] - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device - ) - # Tested parameters - angle = 63 - scale = 0.89 - dx = 0.12 - dy = 0.23 - - # Expected bboxes computed using albumentations: - # from albumentations.augmentations.geometric.functional import bbox_shift_scale_rotate - # from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox - # expected_bboxes = [] - # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *image_size) - # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *image_size) - # out_box = denormalize_bbox(n_out_box, *image_size) - # expected_bboxes.append(out_box) - expected_bboxes = [ - (24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695), - (54.88288587110401, 50.08453280875634, 76.44484547743795, 72.81332520036864), - (27.709526487041554, 34.74952648704156, 51.650473512958435, 58.69047351295844), - (48.56528888843238, 9.611532109828834, 53.35347829361575, 14.39972151501221), - ] - - output_boxes = F.affine_bounding_box( - in_boxes, - in_boxes.format, - in_boxes.image_size, - angle, - (dx * image_size[1], dy * image_size[0]), - scale, - shear=(0, 0), - ) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("angle", [-54, 56]) -@pytest.mark.parametrize("translate", [-7, 8]) -@pytest.mark.parametrize("scale", [0.89, 1.12]) -@pytest.mark.parametrize("shear", [4]) -@pytest.mark.parametrize("center", [None, (12, 14)]) -def test_correctness_affine_segmentation_mask(angle, translate, scale, shear, center): - def _compute_expected_mask(mask, angle_, translate_, scale_, shear_, 
center_): - assert mask.ndim == 3 and mask.shape[0] == 1 - affine_matrix = _compute_affine_matrix(angle_, translate_, scale_, shear_, center_) - inv_affine_matrix = np.linalg.inv(affine_matrix) - inv_affine_matrix = inv_affine_matrix[:2, :] - - expected_mask = torch.zeros_like(mask.cpu()) - for out_y in range(expected_mask.shape[1]): - for out_x in range(expected_mask.shape[2]): - output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0]) - input_pt = np.floor(np.dot(inv_affine_matrix, output_pt)).astype(np.int32) - in_x, in_y = input_pt[:2] - if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]: - expected_mask[0, out_y, out_x] = mask[0, in_y, in_x] - return expected_mask.to(mask.device) - - for mask in make_segmentation_masks(extra_dims=((), (4,))): - output_mask = F.affine_segmentation_mask( - mask, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in mask.shape[-2:][::-1]] - - if mask.ndim < 4: - masks = [mask] - else: - masks = [m for m in mask] - - expected_masks = [] - for mask in masks: - expected_mask = _compute_expected_mask(mask, angle, (translate, translate), scale, (shear, shear), center_) - expected_masks.append(expected_mask) - if len(expected_masks) > 1: - expected_masks = torch.stack(expected_masks) - else: - expected_masks = expected_masks[0] - torch.testing.assert_close(output_mask, expected_masks) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_affine_segmentation_mask_on_fixed_input(device): - # Check transformation against known expected output and CPU/CUDA devices - - # Create a fixed input segmentation mask with 2 square masks - # in top-left, bottom-left corners - mask = torch.zeros(1, 32, 32, dtype=torch.long, device=device) - mask[0, 2:10, 2:10] = 1 - mask[0, 32 - 9 : 32 - 3, 3:9] = 2 - - # Rotate 90 degrees and scale - expected_mask = torch.rot90(mask, k=-1, dims=(-2, -1)) - expected_mask = torch.nn.functional.interpolate(expected_mask[None, :].float(), size=(64, 64), mode="nearest") - expected_mask = expected_mask[0, :, 16 : 64 - 16, 16 : 64 - 16].long() - - out_mask = F.affine_segmentation_mask(mask, 90, [0.0, 0.0], 64.0 / 32.0, [0.0, 0.0]) - - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("angle", range(-90, 90, 56)) -@pytest.mark.parametrize("expand, center", [(True, None), (False, None), (False, (12, 14))]) -def test_correctness_rotate_bounding_box(angle, expand, center): - def _compute_expected_bbox(bbox, angle_, expand_, center_): - affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) - affine_matrix = affine_matrix[:2, :] - - image_size = bbox.image_size - bbox_xyxy = convert_bounding_box_format( - bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY - ) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - # image frame - [0.0, 0.0, 1.0], - [0.0, image_size[0], 1.0], - [image_size[1], image_size[0], 1.0], - [image_size[1], 0.0, 1.0], - ] - ) - transformed_points = np.matmul(points, affine_matrix.T) - out_bbox = [ - np.min(transformed_points[:4, 0]), - np.min(transformed_points[:4, 1]), - np.max(transformed_points[:4, 0]), - np.max(transformed_points[:4, 1]), - ] - if expand_: - tr_x = np.min(transformed_points[4:, 0]) - tr_y = 
np.min(transformed_points[4:, 1]) - out_bbox[0] -= tr_x - out_bbox[1] -= tr_y - out_bbox[2] -= tr_x - out_bbox[3] -= tr_y - - # image_size should be updated, but it is OK here to skip its computation - # as we do not compute it in F.rotate_bounding_box - - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYXY, - image_size=image_size, - dtype=torch.float32, - device=bbox.device, - ) - return convert_bounding_box_format( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False - ) - - image_size = (32, 38) - - for bboxes in make_bounding_boxes( - image_sizes=[ - image_size, - ], - extra_dims=((4,),), - ): - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_bboxes = F.rotate_bounding_box( - bboxes, - bboxes_format, - image_size=bboxes_image_size, - angle=angle, - expand=expand, - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in bboxes_image_size[::-1]] - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, -angle, expand, center_)) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("expand", [False]) # expand=True does not match D2 -def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): - # Check transformation against known expected output - image_size = (64, 64) - # xyxy format - in_boxes = [ - [1, 1, 5, 5], - [1, image_size[0] - 6, 5, image_size[0] - 2], - [image_size[1] - 6, image_size[0] - 6, image_size[1] - 2, image_size[0] - 2], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], - ] - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device - ) - # Tested parameters - angle = 45 - center = None if expand else [12, 23] - - # # Expected bboxes computed using Detectron2: - # from detectron2.data.transforms import RotationTransform, AugmentationList - # from detectron2.data.transforms import AugInput - # import cv2 - # inpt = AugInput(im1, boxes=np.array(in_boxes, dtype="float32")) - # augs = AugmentationList([RotationTransform(*size, angle, expand=expand, center=center, interp=cv2.INTER_NEAREST), ]) - # out = augs(inpt) - # print(inpt.boxes) - if expand: - expected_bboxes = [ - [1.65937957, 42.67157288, 7.31623382, 48.32842712], - [41.96446609, 82.9766594, 47.62132034, 88.63351365], - [82.26955262, 42.67157288, 87.92640687, 48.32842712], - [31.35786438, 31.35786438, 59.64213562, 59.64213562], - ] - else: - expected_bboxes = [ - [-11.33452378, 12.39339828, -5.67766953, 18.05025253], - [28.97056275, 52.69848481, 34.627417, 58.35533906], - [69.27564928, 12.39339828, 74.93250353, 18.05025253], - [18.36396103, 1.07968978, 46.64823228, 29.36396103], - ] - - output_boxes = F.rotate_bounding_box( - in_boxes, - in_boxes.format, - in_boxes.image_size, - angle, - expand=expand, - center=center, - ) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("angle", range(-90, 90, 37)) -@pytest.mark.parametrize("expand, center", [(True, None), (False, None), (False, (12, 14))]) 
-def test_correctness_rotate_segmentation_mask(angle, expand, center): - def _compute_expected_mask(mask, angle_, expand_, center_): - assert mask.ndim == 3 and mask.shape[0] == 1 - image_size = mask.shape[-2:] - affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) - inv_affine_matrix = np.linalg.inv(affine_matrix) - - if expand_: - # Pillow implementation on how to perform expand: - # https://github.com/python-pillow/Pillow/blob/11de3318867e4398057373ee9f12dcb33db7335c/src/PIL/Image.py#L2054-L2069 - height, width = image_size - points = np.array( - [ - [0.0, 0.0, 1.0], - [0.0, 1.0 * height, 1.0], - [1.0 * width, 1.0 * height, 1.0], - [1.0 * width, 0.0, 1.0], - ] - ) - new_points = points @ inv_affine_matrix.T - min_vals = np.min(new_points, axis=0)[:2] - max_vals = np.max(new_points, axis=0)[:2] - cmax = np.ceil(np.trunc(max_vals * 1e4) * 1e-4) - cmin = np.floor(np.trunc((min_vals + 1e-8) * 1e4) * 1e-4) - new_width, new_height = (cmax - cmin).astype("int32").tolist() - tr = np.array([-(new_width - width) / 2.0, -(new_height - height) / 2.0, 1.0]) @ inv_affine_matrix.T - - inv_affine_matrix[:2, 2] = tr[:2] - image_size = [new_height, new_width] - - inv_affine_matrix = inv_affine_matrix[:2, :] - expected_mask = torch.zeros(1, *image_size, dtype=mask.dtype) - - for out_y in range(expected_mask.shape[1]): - for out_x in range(expected_mask.shape[2]): - output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0]) - input_pt = np.floor(np.dot(inv_affine_matrix, output_pt)).astype(np.int32) - in_x, in_y = input_pt[:2] - if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]: - expected_mask[0, out_y, out_x] = mask[0, in_y, in_x] - return expected_mask.to(mask.device) - - for mask in make_segmentation_masks(extra_dims=((), (4,))): - output_mask = F.rotate_segmentation_mask( - mask, - angle=angle, - expand=expand, - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in mask.shape[-2:][::-1]] - - if mask.ndim < 4: - masks = [mask] - else: - masks = [m for m in mask] - - expected_masks = [] - for mask in masks: - expected_mask = _compute_expected_mask(mask, -angle, expand, center_) - expected_masks.append(expected_mask) - if len(expected_masks) > 1: - expected_masks = torch.stack(expected_masks) - else: - expected_masks = expected_masks[0] - torch.testing.assert_close(output_mask, expected_masks) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_rotate_segmentation_mask_on_fixed_input(device): - # Check transformation against known expected output and CPU/CUDA devices - - # Create a fixed input segmentation mask with 2 square masks - # in top-left, bottom-left corners - mask = torch.zeros(1, 32, 32, dtype=torch.long, device=device) - mask[0, 2:10, 2:10] = 1 - mask[0, 32 - 9 : 32 - 3, 3:9] = 2 - - # Rotate 90 degrees - expected_mask = torch.rot90(mask, k=1, dims=(-2, -1)) - out_mask = F.rotate_segmentation_mask(mask, 90, expand=False) - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "format", - [features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH], -) -@pytest.mark.parametrize( - "top, left, height, width, expected_bboxes", - [ - [8, 12, 30, 40, [(-2.0, 7.0, 13.0, 27.0), (38.0, -3.0, 58.0, 14.0), (33.0, 38.0, 44.0, 54.0)]], - [-8, 12, 70, 40, [(-2.0, 23.0, 13.0, 43.0), (38.0, 13.0, 58.0, 30.0), (33.0, 54.0, 44.0, 70.0)]], - ], -) -def 
test_correctness_crop_bounding_box(device, format, top, left, height, width, expected_bboxes): - - # Expected bboxes computed using Albumentations: - # import numpy as np - # from albumentations.augmentations.crops.functional import crop_bbox_by_coords, normalize_bbox, denormalize_bbox - # expected_bboxes = [] - # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *size) - # n_out_box = crop_bbox_by_coords( - # n_in_box, (left, top, left + width, top + height), height, width, *size - # ) - # out_box = denormalize_bbox(n_out_box, height, width) - # expected_bboxes.append(out_box) - - size = (64, 76) - # xyxy format - in_boxes = [ - [10.0, 15.0, 25.0, 35.0], - [50.0, 5.0, 70.0, 22.0], - [45.0, 46.0, 56.0, 62.0], - ] - in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=size, device=device) - if format != features.BoundingBoxFormat.XYXY: - in_boxes = convert_bounding_box_format(in_boxes, features.BoundingBoxFormat.XYXY, format) - - output_boxes = F.crop_bounding_box( - in_boxes, - format, - top, - left, - ) - - if format != features.BoundingBoxFormat.XYXY: - output_boxes = convert_bounding_box_format(output_boxes, format, features.BoundingBoxFormat.XYXY) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "top, left, height, width", - [ - [4, 6, 30, 40], - [-8, 6, 70, 40], - [-8, -6, 70, 8], - ], -) -def test_correctness_crop_segmentation_mask(device, top, left, height, width): - def _compute_expected_mask(mask, top_, left_, height_, width_): - h, w = mask.shape[-2], mask.shape[-1] - if top_ >= 0 and left_ >= 0 and top_ + height_ < h and left_ + width_ < w: - expected = mask[..., top_ : top_ + height_, left_ : left_ + width_] - else: - # Create output mask - expected_shape = mask.shape[:-2] + (height_, width_) - expected = torch.zeros(expected_shape, device=mask.device, dtype=mask.dtype) - - out_y1 = abs(top_) if top_ < 0 else 0 - out_y2 = h - top_ if top_ + height_ >= h else height_ - out_x1 = abs(left_) if left_ < 0 else 0 - out_x2 = w - left_ if left_ + width_ >= w else width_ - - in_y1 = 0 if top_ < 0 else top_ - in_y2 = h if top_ + height_ >= h else top_ + height_ - in_x1 = 0 if left_ < 0 else left_ - in_x2 = w if left_ + width_ >= w else left_ + width_ - # Paste input mask into output - expected[..., out_y1:out_y2, out_x1:out_x2] = mask[..., in_y1:in_y2, in_x1:in_x2] - - return expected - - for mask in make_segmentation_masks(): - if mask.device != torch.device(device): - mask = mask.to(device) - output_mask = F.crop_segmentation_mask(mask, top, left, height, width) - expected_mask = _compute_expected_mask(mask, top, left, height, width) - torch.testing.assert_close(output_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_horizontal_flip_segmentation_mask_on_fixed_input(device): - mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - mask[:, :, 0] = 1 - - out_mask = F.horizontal_flip_segmentation_mask(mask) - - expected_mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - expected_mask[:, :, -1] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): - mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - mask[:, 0, :] = 1 - - out_mask = F.vertical_flip_segmentation_mask(mask) - - expected_mask = torch.zeros((3, 3, 3), 
dtype=torch.long, device=device) - expected_mask[:, -1, :] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "format", - [features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH], -) -@pytest.mark.parametrize( - "top, left, height, width, size", - [ - [0, 0, 30, 30, (60, 60)], - [-5, 5, 35, 45, (32, 34)], - ], -) -def test_correctness_resized_crop_bounding_box(device, format, top, left, height, width, size): - def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): - # bbox should be xyxy - bbox[0] = (bbox[0] - left_) * size_[1] / width_ - bbox[1] = (bbox[1] - top_) * size_[0] / height_ - bbox[2] = (bbox[2] - left_) * size_[1] / width_ - bbox[3] = (bbox[3] - top_) * size_[0] / height_ - return bbox - - image_size = (100, 100) - # xyxy format - in_boxes = [ - [10.0, 10.0, 20.0, 20.0], - [5.0, 10.0, 15.0, 20.0], - ] - expected_bboxes = [] - for in_box in in_boxes: - expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size)) - expected_bboxes = torch.tensor(expected_bboxes, device=device) - - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, device=device - ) - if format != features.BoundingBoxFormat.XYXY: - in_boxes = convert_bounding_box_format(in_boxes, features.BoundingBoxFormat.XYXY, format) - - output_boxes = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) - - if format != features.BoundingBoxFormat.XYXY: - output_boxes = convert_bounding_box_format(output_boxes, format, features.BoundingBoxFormat.XYXY) - - torch.testing.assert_close(output_boxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "top, left, height, width, size", - [ - [0, 0, 30, 30, (60, 60)], - [5, 5, 35, 45, (32, 34)], - ], -) -def test_correctness_resized_crop_segmentation_mask(device, top, left, height, width, size): - def _compute_expected_mask(mask, top_, left_, height_, width_, size_): - output = mask.clone() - output = output[:, top_ : top_ + height_, left_ : left_ + width_] - output = torch.nn.functional.interpolate(output[None, :].float(), size=size_, mode="nearest") - output = output[0, :].long() - return output - - in_mask = torch.zeros(1, 100, 100, dtype=torch.long, device=device) - in_mask[0, 10:20, 10:20] = 1 - in_mask[0, 5:15, 12:23] = 2 - - expected_mask = _compute_expected_mask(in_mask, top, left, height, width, size) - output_mask = F.resized_crop_segmentation_mask(in_mask, top, left, height, width, size) - torch.testing.assert_close(output_mask, expected_mask) - - -def _parse_padding(padding): - if isinstance(padding, int): - return [padding] * 4 - if isinstance(padding, list): - if len(padding) == 1: - return padding * 4 - if len(padding) == 2: - return padding * 2 # [left, up, right, down] - - return padding - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("padding", [[1], [1, 1], [1, 1, 2, 2]]) -def test_correctness_pad_bounding_box(device, padding): - def _compute_expected_bbox(bbox, padding_): - pad_left, pad_up, _, _ = _parse_padding(padding_) - - bbox_format = bbox.format - bbox_dtype = bbox.dtype - bbox = convert_bounding_box_format(bbox, old_format=bbox_format, new_format=features.BoundingBoxFormat.XYXY) - - bbox[0::2] += pad_left - bbox[1::2] += pad_up - - bbox = convert_bounding_box_format( - bbox, 
old_format=features.BoundingBoxFormat.XYXY, new_format=bbox_format, copy=False - ) - if bbox.dtype != bbox_dtype: - # Temporary cast to original dtype - # e.g. float32 -> int - bbox = bbox.to(bbox_dtype) - return bbox - - for bboxes in make_bounding_boxes(): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_boxes = F.pad_bounding_box(bboxes, padding, format=bboxes_format) - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, padding)) - - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_boxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_pad_segmentation_mask_on_fixed_input(device): - mask = torch.ones((1, 3, 3), dtype=torch.long, device=device) - - out_mask = F.pad_segmentation_mask(mask, padding=[1, 1, 1, 1]) - - expected_mask = torch.zeros((1, 5, 5), dtype=torch.long, device=device) - expected_mask[:, 1:-1, 1:-1] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("padding", [[1, 2, 3, 4], [1], 1, [1, 2]]) -@pytest.mark.parametrize("padding_mode", ["constant", "edge", "reflect", "symmetric"]) -def test_correctness_pad_segmentation_mask(padding, padding_mode): - def _compute_expected_mask(mask, padding_, padding_mode_): - h, w = mask.shape[-2], mask.shape[-1] - pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) - - if any(pad <= 0 for pad in [pad_left, pad_up, pad_right, pad_down]): - raise pytest.UsageError( - "Expected output can be computed on positive pad values only, " - "but F.pad_* can also crop for negative values" - ) - - new_h = h + pad_up + pad_down - new_w = w + pad_left + pad_right - - new_shape = (*mask.shape[:-2], new_h, new_w) if len(mask.shape) > 2 else (new_h, new_w) - output = torch.zeros(new_shape, dtype=mask.dtype) - output[..., pad_up:-pad_down, pad_left:-pad_right] = mask - - if padding_mode_ == "edge": - # pad top-left corner, left vertical block, bottom-left corner - output[..., :pad_up, :pad_left] = mask[..., 0, 0].unsqueeze(-1).unsqueeze(-2) - output[..., pad_up:-pad_down, :pad_left] = mask[..., :, 0].unsqueeze(-1) - output[..., -pad_down:, :pad_left] = mask[..., -1, 0].unsqueeze(-1).unsqueeze(-2) - # pad top-right corner, right vertical block, bottom-right corner - output[..., :pad_up, -pad_right:] = mask[..., 0, -1].unsqueeze(-1).unsqueeze(-2) - output[..., pad_up:-pad_down, -pad_right:] = mask[..., :, -1].unsqueeze(-1) - output[..., -pad_down:, -pad_right:] = mask[..., -1, -1].unsqueeze(-1).unsqueeze(-2) - # pad top and bottom horizontal blocks - output[..., :pad_up, pad_left:-pad_right] = mask[..., 0, :].unsqueeze(-2) - output[..., -pad_down:, pad_left:-pad_right] = mask[..., -1, :].unsqueeze(-2) - elif padding_mode_ in ("reflect", "symmetric"): - d1 = 1 if padding_mode_ == "reflect" else 0 - d2 = -1 if padding_mode_ == "reflect" else None - both = (-1, -2) - # pad top-left corner, left vertical block, bottom-left corner - output[..., :pad_up, :pad_left] = mask[..., d1 : pad_up + d1, d1 : pad_left + d1].flip(both) - output[..., pad_up:-pad_down, :pad_left] = mask[..., :, d1 : pad_left + d1].flip(-1) - output[..., -pad_down:, :pad_left] = mask[..., -pad_down - d1 : d2, d1 : pad_left + d1].flip(both) - # pad 
top-right corner, right vertical block, bottom-right corner - output[..., :pad_up, -pad_right:] = mask[..., d1 : pad_up + d1, -pad_right - d1 : d2].flip(both) - output[..., pad_up:-pad_down, -pad_right:] = mask[..., :, -pad_right - d1 : d2].flip(-1) - output[..., -pad_down:, -pad_right:] = mask[..., -pad_down - d1 : d2, -pad_right - d1 : d2].flip(both) - # pad top and bottom horizontal blocks - output[..., :pad_up, pad_left:-pad_right] = mask[..., d1 : pad_up + d1, :].flip(-2) - output[..., -pad_down:, pad_left:-pad_right] = mask[..., -pad_down - d1 : d2, :].flip(-2) - - return output - - for mask in make_segmentation_masks(): - out_mask = F.pad_segmentation_mask(mask, padding, padding_mode=padding_mode) - - expected_mask = _compute_expected_mask(mask, padding, padding_mode) - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "startpoints, endpoints", - [ - [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]], - ], -) -def test_correctness_perspective_bounding_box(device, startpoints, endpoints): - def _compute_expected_bbox(bbox, pcoeffs_): - m1 = np.array( - [ - [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]], - [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]], - ] - ) - m2 = np.array( - [ - [pcoeffs_[6], pcoeffs_[7], 1.0], - [pcoeffs_[6], pcoeffs_[7], 1.0], - ] - ) - - bbox_xyxy = convert_bounding_box_format( - bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY - ) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - ] - ) - numer = np.matmul(points, m1.T) - denom = np.matmul(points, m2.T) - transformed_points = numer / denom - out_bbox = [ - np.min(transformed_points[:, 0]), - np.min(transformed_points[:, 1]), - np.max(transformed_points[:, 0]), - np.max(transformed_points[:, 1]), - ] - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYXY, - image_size=bbox.image_size, - dtype=torch.float32, - device=bbox.device, - ) - return convert_bounding_box_format( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False - ) - - image_size = (32, 38) - - pcoeffs = _get_perspective_coeffs(startpoints, endpoints) - inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - - for bboxes in make_bounding_boxes( - image_sizes=[ - image_size, - ], - extra_dims=((4,),), - ): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_bboxes = F.perspective_bounding_box( - bboxes, - bboxes_format, - perspective_coeffs=pcoeffs, - ) - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=1e-5, atol=1e-5) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "startpoints, endpoints", - [ - [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 
2], [32, 3], [30, 24], [2, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]], - ], -) -def test_correctness_perspective_segmentation_mask(device, startpoints, endpoints): - def _compute_expected_mask(mask, pcoeffs_): - assert mask.ndim == 3 and mask.shape[0] == 1 - m1 = np.array( - [ - [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]], - [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]], - ] - ) - m2 = np.array( - [ - [pcoeffs_[6], pcoeffs_[7], 1.0], - [pcoeffs_[6], pcoeffs_[7], 1.0], - ] - ) - - expected_mask = torch.zeros_like(mask.cpu()) - for out_y in range(expected_mask.shape[1]): - for out_x in range(expected_mask.shape[2]): - output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0]) - - numer = np.matmul(output_pt, m1.T) - denom = np.matmul(output_pt, m2.T) - input_pt = np.floor(numer / denom).astype(np.int32) - - in_x, in_y = input_pt[:2] - if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]: - expected_mask[0, out_y, out_x] = mask[0, in_y, in_x] - return expected_mask.to(mask.device) - - pcoeffs = _get_perspective_coeffs(startpoints, endpoints) - - for mask in make_segmentation_masks(extra_dims=((), (4,))): - mask = mask.to(device) - - output_mask = F.perspective_segmentation_mask( - mask, - perspective_coeffs=pcoeffs, - ) - - if mask.ndim < 4: - masks = [mask] - else: - masks = [m for m in mask] - - expected_masks = [] - for mask in masks: - expected_mask = _compute_expected_mask(mask, pcoeffs) - expected_masks.append(expected_mask) - if len(expected_masks) > 1: - expected_masks = torch.stack(expected_masks) - else: - expected_masks = expected_masks[0] - torch.testing.assert_close(output_mask, expected_masks) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "output_size", - [(18, 18), [18, 15], (16, 19), [12], [46, 48]], -) -def test_correctness_center_crop_bounding_box(device, output_size): - def _compute_expected_bbox(bbox, output_size_): - format_ = bbox.format - image_size_ = bbox.image_size - bbox = convert_bounding_box_format(bbox, format_, features.BoundingBoxFormat.XYWH) - - if len(output_size_) == 1: - output_size_.append(output_size_[-1]) - - cy = int(round((image_size_[0] - output_size_[0]) * 0.5)) - cx = int(round((image_size_[1] - output_size_[1]) * 0.5)) - out_bbox = [ - bbox[0].item() - cx, - bbox[1].item() - cy, - bbox[2].item(), - bbox[3].item(), - ] - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYWH, - image_size=output_size_, - dtype=bbox.dtype, - device=bbox.device, - ) - return convert_bounding_box_format(out_bbox, features.BoundingBoxFormat.XYWH, format_, copy=False) - - for bboxes in make_bounding_boxes( - image_sizes=[(32, 32), (24, 33), (32, 25)], - extra_dims=((4,),), - ): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_boxes = F.center_crop_bounding_box(bboxes, bboxes_format, output_size, bboxes_image_size) - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) - - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_boxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) 
-@pytest.mark.parametrize("output_size", [[4, 2], [4], [7, 6]]) -def test_correctness_center_crop_segmentation_mask(device, output_size): - def _compute_expected_segmentation_mask(mask, output_size): - crop_height, crop_width = output_size if len(output_size) > 1 else [output_size[0], output_size[0]] - - _, image_height, image_width = mask.shape - if crop_width > image_height or crop_height > image_width: - padding = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) - mask = F.pad_image_tensor(mask, padding, fill=0) - - left = round((image_width - crop_width) * 0.5) - top = round((image_height - crop_height) * 0.5) - - return mask[:, top : top + crop_height, left : left + crop_width] - - mask = torch.randint(0, 2, size=(1, 6, 6), dtype=torch.long, device=device) - actual = F.center_crop_segmentation_mask(mask, output_size) - - expected = _compute_expected_segmentation_mask(mask, output_size) - torch.testing.assert_close(expected, actual) - - -# Copied from test/test_functional_tensor.py -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("image_size", ("small", "large")) -@pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) -@pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) -def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma): - fn = F.gaussian_blur_image_tensor - - # true_cv2_results = { - # # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) - # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8) - # "3_3_0.8": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5) - # "3_3_0.5": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8) - # "3_5_0.8": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5) - # "3_5_0.5": ... - # # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) - # # cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7) - # "23_23_1.7": ... 
- # } - p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") - true_cv2_results = torch.load(p) - - if image_size == "small": - tensor = ( - torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) - ) - else: - tensor = torch.from_numpy(np.arange(26 * 28, dtype="uint8").reshape((1, 26, 28))).to(device) - - if dt == torch.float16 and device == "cpu": - # skip float16 on CPU case - return - - if dt is not None: - tensor = tensor.to(dtype=dt) - - _ksize = (ksize, ksize) if isinstance(ksize, int) else ksize - _sigma = sigma[0] if sigma is not None else None - shape = tensor.shape - gt_key = f"{shape[-2]}_{shape[-1]}_{shape[-3]}__{_ksize[0]}_{_ksize[1]}_{_sigma}" - if gt_key not in true_cv2_results: - return - - true_out = ( - torch.tensor(true_cv2_results[gt_key]).reshape(shape[-2], shape[-1], shape[-3]).permute(2, 0, 1).to(tensor) - ) - - image = features.Image(tensor) - - out = fn(image, kernel_size=ksize, sigma=sigma) - torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "fn, make_samples", [(F.elastic_image_tensor, make_images), (F.elastic_segmentation_mask, make_segmentation_masks)] -) -def test_correctness_elastic_image_or_mask_tensor(device, fn, make_samples): - in_box = [10, 15, 25, 35] - for sample in make_samples(sizes=((64, 76),), extra_dims=((), (4,))): - c, h, w = sample.shape[-3:] - # Setup a dummy image with 4 points - sample[..., in_box[1], in_box[0]] = torch.tensor([12, 34, 96, 112])[:c] - sample[..., in_box[3] - 1, in_box[0]] = torch.tensor([12, 34, 96, 112])[:c] - sample[..., in_box[3] - 1, in_box[2] - 1] = torch.tensor([12, 34, 96, 112])[:c] - sample[..., in_box[1], in_box[2] - 1] = torch.tensor([12, 34, 96, 112])[:c] - sample = sample.to(device) - - if fn == F.elastic_image_tensor: - sample = features.Image(sample) - kwargs = {"interpolation": F.InterpolationMode.NEAREST} - else: - sample = features.SegmentationMask(sample) - kwargs = {} - - # Create a displacement grid using sin - n, m = 5.0, 0.1 - d1 = m * torch.sin(torch.arange(h, dtype=torch.float) * torch.pi * n / h) - d2 = m * torch.sin(torch.arange(w, dtype=torch.float) * torch.pi * n / w) - - d1 = d1[:, None].expand((h, w)) - d2 = d2[None, :].expand((h, w)) - - displacement = torch.cat([d1[..., None], d2[..., None]], dim=-1) - displacement = displacement.reshape(1, h, w, 2) - - output = fn(sample, displacement=displacement, **kwargs) - - # Check places where transformed points should be - torch.testing.assert_close(output[..., 12, 9], sample[..., in_box[1], in_box[0]]) - torch.testing.assert_close(output[..., 17, 27], sample[..., in_box[1], in_box[2] - 1]) - torch.testing.assert_close(output[..., 31, 6], sample[..., in_box[3] - 1, in_box[0]]) - torch.testing.assert_close(output[..., 37, 23], sample[..., in_box[3] - 1, in_box[2] - 1]) - - -def test_midlevel_normalize_output_type(): - inpt = torch.rand(1, 3, 32, 32) - output = F.normalize(inpt, mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)) - assert isinstance(output, torch.Tensor) - torch.testing.assert_close(inpt - 0.5, output) - - inpt = make_segmentation_mask() - output = F.normalize(inpt, mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)) - assert isinstance(output, features.SegmentationMask) - torch.testing.assert_close(inpt, output) - - inpt = make_bounding_box(format="XYXY") - output = F.normalize(inpt, mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)) - assert 
isinstance(output, features.BoundingBox) - torch.testing.assert_close(inpt, output) - - inpt = make_image(color_space=features.ColorSpace.RGB) - output = F.normalize(inpt, mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)) - assert isinstance(output, torch.Tensor) - torch.testing.assert_close(inpt - 0.5, output) - - -@pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(3, 32, 32)), - 127 * np.ones((32, 32, 3), dtype="uint8"), - PIL.Image.new("RGB", (32, 32), 122), - ], -) -@pytest.mark.parametrize("copy", [True, False]) -def test_to_image_tensor(inpt, copy): - output = F.to_image_tensor(inpt, copy=copy) - assert isinstance(output, torch.Tensor) - - assert np.asarray(inpt).sum() == output.sum().item() - - if isinstance(inpt, PIL.Image.Image) and not copy: - # we can't check this option - # as PIL -> numpy is always copying - return - - if isinstance(inpt, PIL.Image.Image): - inpt.putpixel((0, 0), 11) - else: - inpt[0, 0, 0] = 11 - if copy: - assert output[0, 0, 0] != 11 - else: - assert output[0, 0, 0] == 11 - - -@pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(3, 32, 32), dtype=torch.uint8), - 127 * np.ones((32, 32, 3), dtype="uint8"), - PIL.Image.new("RGB", (32, 32), 122), - ], -) -@pytest.mark.parametrize("mode", [None, "RGB"]) -def test_to_image_pil(inpt, mode): - output = F.to_image_pil(inpt, mode=mode) - assert isinstance(output, PIL.Image.Image) - - assert np.asarray(inpt).sum() == np.asarray(output).sum() diff --git a/test/test_prototype_transforms_utils.py b/test/test_prototype_transforms_utils.py deleted file mode 100644 index b83c4f3acb9..00000000000 --- a/test/test_prototype_transforms_utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import PIL.Image -import pytest - -import torch - -from test_prototype_transforms_functional import make_bounding_box, make_image, make_segmentation_mask - -from torchvision.prototype import features -from torchvision.prototype.transforms._utils import has_all, has_any, is_simple_tensor -from torchvision.prototype.transforms.functional import to_image_pil - - -IMAGE = make_image(color_space=features.ColorSpace.RGB) -BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, image_size=IMAGE.image_size) -SEGMENTATION_MASK = make_segmentation_mask(size=IMAGE.image_size) - - -@pytest.mark.parametrize( - ("sample", "types", "expected"), - [ - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.BoundingBox,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.SegmentationMask,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.BoundingBox), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.SegmentationMask), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.BoundingBox, features.SegmentationMask), True), - ((SEGMENTATION_MASK,), (features.Image, features.BoundingBox), False), - ((BOUNDING_BOX,), (features.Image, features.SegmentationMask), False), - ((IMAGE,), (features.BoundingBox, features.SegmentationMask), False), - ( - (IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), - (features.Image, features.BoundingBox, features.SegmentationMask), - True, - ), - ((), (features.Image, features.BoundingBox, features.SegmentationMask), False), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda obj: isinstance(obj, features.Image),), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda _: False,), False), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda _: True,), True), - 
((IMAGE,), (features.Image, PIL.Image.Image, is_simple_tensor), True), - ((torch.Tensor(IMAGE),), (features.Image, PIL.Image.Image, is_simple_tensor), True), - ((to_image_pil(IMAGE),), (features.Image, PIL.Image.Image, is_simple_tensor), True), - ], -) -def test_has_any(sample, types, expected): - assert has_any(sample, *types) is expected - - -@pytest.mark.parametrize( - ("sample", "types", "expected"), - [ - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.BoundingBox,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.SegmentationMask,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.BoundingBox), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.SegmentationMask), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.BoundingBox, features.SegmentationMask), True), - ( - (IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), - (features.Image, features.BoundingBox, features.SegmentationMask), - True, - ), - ((BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.BoundingBox), False), - ((BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.SegmentationMask), False), - ((IMAGE, SEGMENTATION_MASK), (features.BoundingBox, features.SegmentationMask), False), - ( - (IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), - (features.Image, features.BoundingBox, features.SegmentationMask), - True, - ), - ((BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.BoundingBox, features.SegmentationMask), False), - ((IMAGE, SEGMENTATION_MASK), (features.Image, features.BoundingBox, features.SegmentationMask), False), - ((IMAGE, BOUNDING_BOX), (features.Image, features.BoundingBox, features.SegmentationMask), False), - ( - (IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), - (lambda obj: isinstance(obj, (features.Image, features.BoundingBox, features.SegmentationMask)),), - True, - ), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda _: False,), False), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda _: True,), True), - ], -) -def test_has_all(sample, types, expected): - assert has_all(sample, *types) is expected diff --git a/test/test_transforms.py b/test/test_transforms.py index 6ec670ffa78..d93800d59bc 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -2,17 +2,18 @@ import os import random import re +import sys from functools import partial import numpy as np import pytest import torch import torchvision.transforms as transforms -import torchvision.transforms._pil_constants as _pil_constants +import torchvision.transforms._functional_tensor as F_t import torchvision.transforms.functional as F -import torchvision.transforms.functional_tensor as F_t from PIL import Image from torch._utils_internal import get_file_path_2 +from torchvision.utils import _Image_fromarray try: import accimage @@ -174,7 +175,7 @@ def test_accimage_pil_to_tensor(self): def test_accimage_resize(self): trans = transforms.Compose( [ - transforms.Resize(256, interpolation=_pil_constants.LINEAR), + transforms.Resize(256, interpolation=Image.LINEAR), transforms.PILToTensor(), transforms.ConvertImageDtype(dtype=torch.float), ] @@ -319,7 +320,7 @@ def test_randomresized_params(): scale_range = (scale_min, scale_min + round(random.random(), 2)) aspect_min = max(round(random.random(), 2), epsilon) aspect_ratio_range = (aspect_min, aspect_min + round(random.random(), 2)) - randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range) + 
randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range, antialias=True) i, j, h, w = randresizecrop.get_params(img, scale_range, aspect_ratio_range) aspect_ratio_obtained = w / h assert ( @@ -366,7 +367,7 @@ def test_randomresized_params(): def test_resize(height, width, osize, max_size): img = Image.new("RGB", size=(width, height), color=127) - t = transforms.Resize(osize, max_size=max_size) + t = transforms.Resize(osize, max_size=max_size, antialias=True) result = t(img) msg = f"{height}, {width} - {osize} - {max_size}" @@ -424,7 +425,7 @@ def test_resize_sequence_output(height, width, osize): img = Image.new("RGB", size=(width, height), color=127) oheight, owidth = osize - t = transforms.Resize(osize) + t = transforms.Resize(osize, antialias=True) result = t(img) assert (owidth, oheight) == result.size @@ -447,11 +448,21 @@ def test_resize_size_equals_small_edge_size(height, width): img = Image.new("RGB", size=(width, height), color=127) small_edge = min(height, width) - t = transforms.Resize(small_edge, max_size=max_size) + t = transforms.Resize(small_edge, max_size=max_size, antialias=True) result = t(img) assert max(result.size) == max_size +def test_resize_equal_input_output_sizes(): + # Regression test for https://github.com/pytorch/vision/issues/7518 + height, width = 28, 27 + img = Image.new("RGB", size=(width, height)) + + t = transforms.Resize((height, width), antialias=True) + result = t(img) + assert result is img + + class TestPad: @pytest.mark.parametrize("fill", [85, 85.0]) def test_pad(self, fill): @@ -605,7 +616,7 @@ def _get_1_channel_tensor_various_types(): img_data_short = torch.ShortTensor(1, 4, 4).random_() expected_output = img_data_short.numpy() - yield img_data_short, expected_output, "I;16" + yield img_data_short, expected_output, "I;16" if sys.byteorder == "little" else "I;16B" img_data_int = torch.IntTensor(1, 4, 4).random_() expected_output = img_data_int.numpy() @@ -622,7 +633,7 @@ def _get_2d_tensor_various_types(): img_data_short = torch.ShortTensor(4, 4).random_() expected_output = img_data_short.numpy() - yield img_data_short, expected_output, "I;16" + yield img_data_short, expected_output, "I;16" if sys.byteorder == "little" else "I;16B" img_data_int = torch.IntTensor(4, 4).random_() expected_output = img_data_int.numpy() @@ -644,16 +655,16 @@ def test_1_channel_float_tensor_to_pil_image(self): img_F_mode = transforms.ToPILImage(mode="F")(img_data) assert img_F_mode.mode == "F" torch.testing.assert_close( - np.array(Image.fromarray(img_data.squeeze(0).numpy(), mode="F")), np.array(img_F_mode) + np.array(_Image_fromarray(img_data.squeeze(0).numpy(), mode="F")), np.array(img_F_mode) ) @pytest.mark.parametrize("with_mode", [False, True]) @pytest.mark.parametrize( "img_data, expected_mode", [ - (torch.Tensor(4, 4, 1).uniform_().numpy(), "F"), + (torch.Tensor(4, 4, 1).uniform_().numpy(), "L"), (torch.ByteTensor(4, 4, 1).random_(0, 255).numpy(), "L"), - (torch.ShortTensor(4, 4, 1).random_().numpy(), "I;16"), + (torch.ShortTensor(4, 4, 1).random_().numpy(), "I;16" if sys.byteorder == "little" else "I;16B"), (torch.IntTensor(4, 4, 1).random_().numpy(), "I"), ], ) @@ -661,6 +672,8 @@ def test_1_channel_ndarray_to_pil_image(self, with_mode, img_data, expected_mode transform = transforms.ToPILImage(mode=expected_mode) if with_mode else transforms.ToPILImage() img = transform(img_data) assert img.mode == expected_mode + if np.issubdtype(img_data.dtype, np.floating): + img_data = (img_data * 255).astype(np.uint8) # note: we explicitly 
convert img's dtype because pytorch doesn't support uint16 # and otherwise assert_close wouldn't be able to construct a tensor from the uint16 array torch.testing.assert_close(img_data[:, :, 0], np.asarray(img).astype(img_data.dtype)) @@ -731,9 +744,9 @@ def test_2d_tensor_to_pil_image(self, with_mode, img_data, expected_output, expe @pytest.mark.parametrize( "img_data, expected_mode", [ - (torch.Tensor(4, 4).uniform_().numpy(), "F"), + (torch.Tensor(4, 4).uniform_().numpy(), "L"), (torch.ByteTensor(4, 4).random_(0, 255).numpy(), "L"), - (torch.ShortTensor(4, 4).random_().numpy(), "I;16"), + (torch.ShortTensor(4, 4).random_().numpy(), "I;16" if sys.byteorder == "little" else "I;16B"), (torch.IntTensor(4, 4).random_().numpy(), "I"), ], ) @@ -741,6 +754,8 @@ def test_2d_ndarray_to_pil_image(self, with_mode, img_data, expected_mode): transform = transforms.ToPILImage(mode=expected_mode) if with_mode else transforms.ToPILImage() img = transform(img_data) assert img.mode == expected_mode + if np.issubdtype(img_data.dtype, np.floating): + img_data = (img_data * 255).astype(np.uint8) np.testing.assert_allclose(img_data, img) @pytest.mark.parametrize("expected_mode", [None, "RGB", "HSV", "YCbCr"]) @@ -864,8 +879,6 @@ def test_ndarray_bad_types_to_pil_image(self): trans(np.ones([4, 4, 1], np.uint16)) with pytest.raises(TypeError, match=reg_msg): trans(np.ones([4, 4, 1], np.uint32)) - with pytest.raises(TypeError, match=reg_msg): - trans(np.ones([4, 4, 1], np.float64)) with pytest.raises(ValueError, match=r"pic should be 2/3 dimensional. Got \d+ dimensions."): transforms.ToPILImage()(np.ones([1, 4, 4, 3])) @@ -883,7 +896,7 @@ def test_adjust_brightness(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") # test 0 y_pil = F.adjust_brightness(x_pil, 1) @@ -909,7 +922,7 @@ def test_adjust_contrast(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") # test 0 y_pil = F.adjust_contrast(x_pil, 1) @@ -931,38 +944,11 @@ def test_adjust_contrast(): torch.testing.assert_close(y_np, y_ans) -@pytest.mark.skipif(Image.__version__ >= "7", reason="Temporarily disabled") -def test_adjust_saturation(): - x_shape = [2, 2, 3] - x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] - x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") - - # test 0 - y_pil = F.adjust_saturation(x_pil, 1) - y_np = np.array(y_pil) - torch.testing.assert_close(y_np, x_np) - - # test 1 - y_pil = F.adjust_saturation(x_pil, 0.5) - y_np = np.array(y_pil) - y_ans = [2, 4, 8, 87, 128, 173, 39, 25, 138, 133, 215, 88] - y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape) - torch.testing.assert_close(y_np, y_ans) - - # test 2 - y_pil = F.adjust_saturation(x_pil, 2) - y_np = np.array(y_pil) - y_ans = [0, 6, 22, 0, 149, 255, 32, 0, 255, 4, 255, 0] - y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape) - torch.testing.assert_close(y_np, y_ans) - - def test_adjust_hue(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") with pytest.raises(ValueError): F.adjust_hue(x_pil, -0.7) @@ 
-1044,7 +1030,7 @@ def test_adjust_sharpness(): 117, ] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") # test 0 y_pil = F.adjust_sharpness(x_pil, 1) @@ -1167,7 +1153,7 @@ def test_adjust_sharpness(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") x_th = torch.tensor(x_np.transpose(2, 0, 1)) y_pil = F.adjust_sharpness(x_pil, 2) y_np = np.array(y_pil).transpose(2, 0, 1) @@ -1179,7 +1165,7 @@ def test_adjust_gamma(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") # test 0 y_pil = F.adjust_gamma(x_pil, 1) @@ -1205,7 +1191,7 @@ def test_adjusts_L_mode(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_rgb = Image.fromarray(x_np, mode="RGB") + x_rgb = _Image_fromarray(x_np, mode="RGB") x_l = x_rgb.convert("L") assert F.adjust_brightness(x_l, 2).mode == "L" @@ -1335,7 +1321,7 @@ def test_to_grayscale(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") x_pil_2 = x_pil.convert("L") gray_np = np.array(x_pil_2) @@ -1424,17 +1410,17 @@ def test_random_choice(proba_passthrough, seed): def test_random_order(): random_state = random.getstate() random.seed(42) - random_order_transform = transforms.RandomOrder([transforms.Resize(20), transforms.CenterCrop(10)]) + random_order_transform = transforms.RandomOrder([transforms.Resize(20, antialias=True), transforms.CenterCrop(10)]) img = transforms.ToPILImage()(torch.rand(3, 25, 25)) num_samples = 250 num_normal_order = 0 - resize_crop_out = transforms.CenterCrop(10)(transforms.Resize(20)(img)) + resize_crop_out = transforms.CenterCrop(10)(transforms.Resize(20, antialias=True)(img)) for _ in range(num_samples): out = random_order_transform(img) if out == resize_crop_out: num_normal_order += 1 - p_value = stats.binom_test(num_normal_order, num_samples, p=0.5) + p_value = stats.binomtest(num_normal_order, num_samples, p=0.5).pvalue random.setstate(random_state) assert p_value > 0.0001 @@ -1522,10 +1508,10 @@ def test_ten_crop(should_vflip, single_dim): five_crop.__repr__() if should_vflip: - vflipped_img = img.transpose(_pil_constants.FLIP_TOP_BOTTOM) + vflipped_img = img.transpose(Image.FLIP_TOP_BOTTOM) expected_output += five_crop(vflipped_img) else: - hflipped_img = img.transpose(_pil_constants.FLIP_LEFT_RIGHT) + hflipped_img = img.transpose(Image.FLIP_LEFT_RIGHT) expected_output += five_crop(hflipped_img) assert len(results) == 10 @@ -1629,8 +1615,8 @@ def test_augmix(fill, severity, mixture_width, chain_depth, all_ops, grayscale): def test_random_crop(): height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 img = torch.ones(3, height, width, dtype=torch.uint8) result = transforms.Compose( [ @@ -1670,7 +1656,7 @@ def test_random_crop(): 
assert result.size(1) == height + 1 assert result.size(2) == width + 1 - t = transforms.RandomCrop(48) + t = transforms.RandomCrop(33) img = torch.ones(3, 32, 32) with pytest.raises(ValueError, match=r"Required crop size .+ is larger than input image size .+"): t(img) @@ -1679,8 +1665,8 @@ def test_random_crop(): def test_center_crop(): height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 img = torch.ones(3, height, width, dtype=torch.uint8) oh1 = (height - oheight) // 2 @@ -1784,7 +1770,7 @@ def test_color_jitter(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") x_pil_2 = x_pil.convert("L") for _ in range(10): @@ -1798,6 +1784,12 @@ def test_color_jitter(): color_jitter.__repr__() +@pytest.mark.parametrize("hue", [1, (-1, 1)]) +def test_color_jitter_hue_out_of_bounds(hue): + with pytest.raises(ValueError, match=re.escape("hue values should be between (-0.5, 0.5)")): + transforms.ColorJitter(hue=hue) + + @pytest.mark.parametrize("seed", range(10)) @pytest.mark.skipif(stats is None, reason="scipy.stats not available") def test_random_erasing(seed): @@ -1818,7 +1810,7 @@ def test_random_erasing(seed): tol = 0.05 assert 1 / 3 - tol <= aspect_ratio <= 3 + tol - # Make sure that h > w and h < w are equaly likely (log-scale sampling) + # Make sure that h > w and h < w are equally likely (log-scale sampling) aspect_ratios = [] random.seed(42) trial = 1000 @@ -1834,7 +1826,7 @@ def test_random_erasing(seed): aspect_ratios.append(h / w) count_bigger_then_ones = len([1 for aspect_ratio in aspect_ratios if aspect_ratio > 1]) - p_value = stats.binom_test(count_bigger_then_ones, trial, p=0.5) + p_value = stats.binomtest(count_bigger_then_ones, trial, p=0.5).pvalue assert p_value > 0.0001 # Checking if RandomErasing can be printed as string @@ -1866,27 +1858,8 @@ def test_random_rotation(): # Checking if RandomRotation can be printed as string t.__repr__() - # assert deprecation warning and non-BC - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'resample' is deprecated since 0.12 and will be removed 0.14. " - "Please use 'interpolation' instead." - ), - ): - t = transforms.RandomRotation((-10, 10), resample=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR - - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - t = transforms.RandomRotation((-10, 10), interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.RandomRotation((-10, 10), interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR def test_random_rotation_error(): @@ -2078,7 +2051,7 @@ def _test_transformation(self, angle, translate, scale, shear, pil_image, input_ # https://github.com/python-pillow/Pillow/blob/71f8ec6a0cfc1008076a023c0756542539d057ab/ # src/libImaging/Geometry.c#L1060 input_pt = np.array([x + 0.5, y + 0.5, 1.0]) - res = np.floor(np.dot(inv_true_matrix, input_pt)).astype(np.int) + res = np.floor(np.dot(inv_true_matrix, input_pt)).astype(int) _x, _y = res[:2] if 0 <= _x < input_img.shape[1] and 0 <= _y < input_img.shape[0]: true_result[y, x, :] = input_img[_y, _x, :] @@ -2217,37 +2190,8 @@ def test_random_affine(): t = transforms.RandomAffine(10, interpolation=transforms.InterpolationMode.BILINEAR) assert "bilinear" in t.__repr__() - # assert deprecation warning and non-BC - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'resample' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'interpolation' instead." - ), - ): - t = transforms.RandomAffine(10, resample=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR - - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'fillcolor' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'fill' instead." - ), - ): - t = transforms.RandomAffine(10, fillcolor=10) - assert t.fill == 10 - - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - t = transforms.RandomAffine(10, interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.RandomAffine(10, interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR def test_elastic_transformation(): @@ -2265,9 +2209,8 @@ def test_elastic_transformation(): with pytest.raises(ValueError, match=r"sigma is a sequence its length should be 2"): transforms.ElasticTransform(alpha=2.0, sigma=[1.0, 0.0, 1.0]) - with pytest.warns(UserWarning, match=r"Argument interpolation should be of type InterpolationMode"): - t = transforms.transforms.ElasticTransform(alpha=2.0, sigma=2.0, interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.transforms.ElasticTransform(alpha=2.0, sigma=2.0, interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR with pytest.raises(TypeError, match=r"fill should be int or float"): transforms.ElasticTransform(alpha=1.0, sigma=1.0, fill={}) @@ -2287,5 +2230,17 @@ def test_elastic_transformation(): t.__repr__() +def test_random_grayscale_with_grayscale_input(): + transform = transforms.RandomGrayscale(p=1.0) + + image_tensor = torch.randint(0, 256, (1, 16, 16), dtype=torch.uint8) + output_tensor = transform(image_tensor) + torch.testing.assert_close(output_tensor, image_tensor) + + image_pil = F.to_pil_image(image_tensor) + output_pil = transform(image_pil) + torch.testing.assert_close(F.pil_to_tensor(output_pil), image_tensor) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index f4ca544deb8..eac52dafc17 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -2,16 +2,16 @@ import sys import numpy as np +import PIL.Image import pytest import torch -import torchvision.transforms._pil_constants as _pil_constants from common_utils import ( _assert_approx_equal_tensor_to_pil, _assert_equal_tensor_to_pil, _create_data, _create_data_batch, assert_equal, - cpu_and_gpu, + cpu_and_cuda, float_dtypes, get_tmp_dir, int_dtypes, @@ -20,7 +20,12 @@ from torchvision.transforms import functional as F, InterpolationMode from torchvision.transforms.autoaugment import _apply_op -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) def _test_transform_vs_scripted(transform, s_transform, tensor, msg=None): @@ -94,12 +99,12 @@ def _test_op(func, method, device, channels=3, fn_kwargs=None, meth_kwargs=None, def _test_fn_save_load(fn, tmpdir): scripted_fn = torch.jit.script(fn) - p = os.path.join(tmpdir, f"t_op_list_{fn.__name__ if hasattr(fn, '__name__') else fn.__class__.__name__}.pt") + p = os.path.join(tmpdir, f"t_op_list_{getattr(fn, '__name__', fn.__class__.__name__)}.pt") scripted_fn.save(p) _ = torch.jit.load(p) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "func,method,fn_kwargs,match_kwargs", [ @@ -124,7 +129,7 @@ def test_random(func, method, device, channels, fn_kwargs, match_kwargs): @pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("channels", [1, 3]) class TestColorJitter: 
@pytest.fixture(autouse=True) @@ -200,7 +205,7 @@ def test_color_jitter_all(self, device, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("m", ["constant", "edge", "reflect", "symmetric"]) @pytest.mark.parametrize("mul", [1, -1]) def test_pad(m, mul, device): @@ -223,7 +228,7 @@ def test_pad(m, mul, device): _test_op(F.pad, T.Pad, device=device, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_crop(device): fn_kwargs = {"top": 2, "left": 3, "height": 4, "width": 5} # Test transforms.RandomCrop with size and padding as tuple @@ -251,7 +256,7 @@ def test_crop(device): _test_functional_op(F.crop, fn_kwargs=fn_kwargs, device=device) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "padding_config", [ @@ -277,7 +282,7 @@ def test_random_crop_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_center_crop(device, tmpdir): fn_kwargs = {"output_size": (4, 5)} meth_kwargs = {"size": (4, 5)} @@ -307,7 +312,7 @@ def test_center_crop_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "fn, method, out_length", [ @@ -366,7 +371,7 @@ class TestResize: def test_resize_int(self, size): # TODO: Minimal check for bug-fix, improve this later x = torch.rand(3, 32, 46) - t = T.Resize(size=size) + t = T.Resize(size=size, antialias=True) y = t(x) # If size is an int, smaller edge of the image will be matched to this number. # i.e, if height > width, then image will be rescaled to (size * height / width, size). 
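Note: the smaller-edge rule described in the comment above can be sanity-checked outside the test suite. A minimal sketch, assuming the same 3x32x46 input used by `test_resize_int` and the `antialias=True` argument this diff adds to `T.Resize`:

import torch
import torchvision.transforms as T

# Height (32) is the smaller edge, so it is matched to `size`;
# the width scales by the same ratio: int(size * 46 / 32).
x = torch.rand(3, 32, 46)
size = 16
y = T.Resize(size=size, antialias=True)(x)
assert y.shape[1] == size                    # new height == 16
assert y.shape[2] == int(size * 46 / 32)     # new width  == 23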
@@ -374,11 +379,11 @@ def test_resize_int(self, size): assert y.shape[1] == size assert y.shape[2] == int(size * 46 / 32) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64]) @pytest.mark.parametrize("size", [[32], [32, 32], (32, 32), [34, 35]]) @pytest.mark.parametrize("max_size", [None, 35, 1000]) - @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) + @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize_scripted(self, dt, size, max_size, interpolation, device): tensor, _ = _create_data(height=34, width=36, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -389,25 +394,25 @@ def test_resize_scripted(self, dt, size, max_size, interpolation, device): if max_size is not None and len(size) != 1: pytest.skip("Size should be an int or a sequence of length 1 if max_size is specified") - transform = T.Resize(size=size, interpolation=interpolation, max_size=max_size) + transform = T.Resize(size=size, interpolation=interpolation, max_size=max_size, antialias=True) s_transform = torch.jit.script(transform) _test_transform_vs_scripted(transform, s_transform, tensor) _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) def test_resize_save_load(self, tmpdir): - fn = T.Resize(size=[32]) + fn = T.Resize(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) @pytest.mark.parametrize("ratio", [(0.75, 1.333), [0.75, 1.333]]) @pytest.mark.parametrize("size", [(32,), [44], [32], [32, 32], (32, 32), [44, 55]]) - @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) + @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC, NEAREST_EXACT]) @pytest.mark.parametrize("antialias", [None, True, False]) def test_resized_crop(self, scale, ratio, size, interpolation, antialias, device): - if antialias and interpolation == NEAREST: - pytest.skip("Can not resize if interpolation mode is NEAREST and antialias=True") + if antialias and interpolation in {NEAREST, NEAREST_EXACT}: + pytest.skip(f"Can not resize if interpolation mode is {interpolation} and antialias=True") tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -419,7 +424,7 @@ def test_resized_crop(self, scale, ratio, size, interpolation, antialias, device _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) def test_resized_crop_save_load(self, tmpdir): - fn = T.RandomResizedCrop(size=[32]) + fn = T.RandomResizedCrop(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) @@ -438,42 +443,42 @@ def test_random_affine_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("shear", [15, 10.0, (5.0, 10.0), [-15, 15], [-10.0, 10.0, -11.0, 11.0]]) def test_random_affine_shear(device, interpolation, shear): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, shear=shear) -@pytest.mark.parametrize("device", cpu_and_gpu()) 
+@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) def test_random_affine_scale(device, interpolation, scale): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, scale=scale) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("translate", [(0.1, 0.2), [0.2, 0.1]]) def test_random_affine_translate(device, interpolation, translate): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, translate=translate) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("degrees", [45, 35.0, (-45, 45), [-90.0, 90.0]]) def test_random_affine_degrees(device, interpolation, degrees): _test_random_affine_helper(device, degrees=degrees, interpolation=interpolation) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("fill", [85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_random_affine_fill(device, interpolation, fill): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, fill=fill) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("center", [(0, 0), [10, 10], None, (56, 44)]) @pytest.mark.parametrize("expand", [True, False]) @pytest.mark.parametrize("degrees", [45, 35.0, (-45, 45), [-90.0, 90.0]]) @@ -495,7 +500,7 @@ def test_random_rotate_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("distortion_scale", np.linspace(0.1, 1.0, num=20)) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("fill", [85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) @@ -515,7 +520,7 @@ def test_random_perspective_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "Klass, meth_kwargs", [(T.Grayscale, {"num_output_channels": 1}), (T.Grayscale, {"num_output_channels": 3}), (T.RandomGrayscale, {})], @@ -525,7 +530,7 @@ def test_to_grayscale(device, Klass, meth_kwargs): _test_class_op(Klass, meth_kwargs=meth_kwargs, test_exact_match=False, device=device, tol=tol, agg_method="max") -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("in_dtype", int_dtypes() + float_dtypes()) @pytest.mark.parametrize("out_dtype", int_dtypes() + float_dtypes()) def test_convert_image_dtype(device, in_dtype, out_dtype): @@ -556,7 +561,7 @@ def test_convert_image_dtype_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("policy", [policy for policy in T.AutoAugmentPolicy]) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_autoaugment(device, policy, fill): @@ -570,7 +575,7 @@ def test_autoaugment(device, policy, fill): 
_test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("num_ops", [1, 2, 3]) @pytest.mark.parametrize("magnitude", [7, 9, 11]) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) @@ -585,7 +590,7 @@ def test_randaugment(device, num_ops, magnitude, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_trivialaugmentwide(device, fill): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -598,7 +603,7 @@ def test_trivialaugmentwide(device, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_augmix(device, fill): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -635,13 +640,13 @@ def shear(pil_img, level, mode, resample): matrix = (1, level, 0, 0, 1, 0) elif mode == "Y": matrix = (1, 0, 0, level, 1, 0) - return pil_img.transform((image_size, image_size), _pil_constants.AFFINE, matrix, resample=resample) + return pil_img.transform((image_size, image_size), PIL.Image.AFFINE, matrix, resample=resample) t_img, pil_img = _create_data(image_size, image_size) resample_pil = { - F.InterpolationMode.NEAREST: _pil_constants.NEAREST, - F.InterpolationMode.BILINEAR: _pil_constants.BILINEAR, + F.InterpolationMode.NEAREST: PIL.Image.NEAREST, + F.InterpolationMode.BILINEAR: PIL.Image.BILINEAR, }[interpolation] level = 0.3 @@ -664,10 +669,20 @@ def shear(pil_img, level, mode, resample): _assert_approx_equal_tensor_to_pil(out, expected_out) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "config", - [{"value": 0.2}, {"value": "random"}, {"value": (0.2, 0.2, 0.2)}, {"value": "random", "ratio": (0.1, 0.2)}], + [ + {}, + {"value": 1}, + {"value": 0.2}, + {"value": "random"}, + {"value": (1, 1, 1)}, + {"value": (0.2, 0.2, 0.2)}, + {"value": [1, 1, 1]}, + {"value": [0.2, 0.2, 0.2]}, + {"value": "random", "ratio": (0.1, 0.2)}, + ], ) def test_random_erasing(device, config): tensor, _ = _create_data(24, 32, channels=3, device=device) @@ -692,7 +707,7 @@ def test_random_erasing_with_invalid_data(): random_erasing(img) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_normalize(device, tmpdir): fn = T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) tensor, _ = _create_data(26, 34, device=device) @@ -711,7 +726,7 @@ def test_normalize(device, tmpdir): scripted_fn.save(os.path.join(tmpdir, "t_norm.pt")) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_linear_transformation(device, tmpdir): c, h, w = 3, 24, 32 @@ -737,7 +752,7 @@ def test_linear_transformation(device, tmpdir): scripted_fn.save(os.path.join(tmpdir, "t_norm.pt")) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_compose(device): tensor, _ = _create_data(26, 34, device=device) tensor = 
tensor.to(dtype=torch.float32) / 255.0 @@ -765,7 +780,7 @@ def test_compose(device): torch.jit.script(t) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_random_apply(device): tensor, _ = _create_data(26, 34, device=device) tensor = tensor.to(dtype=torch.float32) / 255.0 @@ -807,7 +822,7 @@ def test_random_apply(device): torch.jit.script(transforms) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "meth_kwargs", [ @@ -843,3 +858,35 @@ def test_gaussian_blur(device, channels, meth_kwargs): agg_method="max", tol=tol, ) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "fill", + [ + 1, + 1.0, + [1], + [1.0], + (1,), + (1.0,), + [1, 2, 3], + [1.0, 2.0, 3.0], + (1, 2, 3), + (1.0, 2.0, 3.0), + ], +) +@pytest.mark.parametrize("channels", [1, 3]) +def test_elastic_transform(device, channels, fill): + if isinstance(fill, (list, tuple)) and len(fill) > 1 and channels == 1: + # For this the test would correctly fail, since the number of channels in the image does not match `fill`. + # Thus, this is not an issue in the transform, but rather a problem of parametrization that just gives the + # product of `fill` and `channels`. + return + + _test_class_op( + T.ElasticTransform, + meth_kwargs=dict(fill=fill), + channels=channels, + device=device, + ) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py new file mode 100644 index 00000000000..9eb209f43fc --- /dev/null +++ b/test/test_transforms_v2.py @@ -0,0 +1,7501 @@ +import contextlib +import decimal +import functools +import inspect +import itertools +import math +import pickle +import random +import re +import sys +from copy import deepcopy +from pathlib import Path +from unittest import mock + +import numpy as np +import PIL.Image +import pytest + +import torch +import torchvision.ops +import torchvision.transforms.v2 as transforms + +from common_utils import ( + assert_equal, + cache, + cpu_and_cuda, + freeze_rng_state, + ignore_jit_no_profile_information_warning, + make_bounding_boxes, + make_detection_masks, + make_image, + make_image_pil, + make_image_tensor, + make_keypoints, + make_segmentation_mask, + make_video, + make_video_tensor, + needs_cuda, + set_rng_seed, +) + +from torch import nn +from torch.testing import assert_close +from torch.utils._pytree import tree_flatten, tree_map +from torch.utils.data import DataLoader, default_collate +from torchvision import tv_tensors +from torchvision.ops.boxes import box_iou + +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.functional import pil_modes_mapping, to_pil_image +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2._utils import check_type, is_pure_tensor +from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs, _parallelogram_to_bounding_boxes +from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal + + +# turns all warnings into errors for this module +pytestmark = [pytest.mark.filterwarnings("error")] + +if sys.version_info[:2] >= (3, 12): + # torchscript relies on some AST stuff that got deprecated in 3.12, + # so we have to explicitly ignore those otherwise we'd error on warnings due to the pytestmark filter above. 
+ pytestmark.append(pytest.mark.filterwarnings("ignore::DeprecationWarning")) + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +def _to_tolerances(maybe_tolerance_dict): + if not isinstance(maybe_tolerance_dict, dict): + return dict(rtol=None, atol=None) + + tolerances = dict(rtol=0, atol=0) + tolerances.update(maybe_tolerance_dict) + return tolerances + + +def _check_kernel_cuda_vs_cpu(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces closes results for inputs on GPU and CPU.""" + if input.device.type != "cuda": + return + + input_cuda = input.as_subclass(torch.Tensor) + input_cpu = input_cuda.to("cpu") + + with freeze_rng_state(): + actual = kernel(input_cuda, *args, **kwargs) + with freeze_rng_state(): + expected = kernel(input_cpu, *args, **kwargs) + + assert_close(actual, expected, check_device=False, rtol=rtol, atol=atol) + + +@cache +def _script(obj): + try: + return torch.jit.script(obj) + except Exception as error: + name = getattr(obj, "__name__", obj.__class__.__name__) + raise AssertionError(f"Trying to `torch.jit.script` `{name}` raised the error above.") from error + + +def _check_kernel_scripted_vs_eager(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel is scriptable and if the scripted output is close to the eager one.""" + if input.device.type != "cpu": + return + + kernel_scripted = _script(kernel) + + input = input.as_subclass(torch.Tensor) + with ignore_jit_no_profile_information_warning(): + with freeze_rng_state(): + actual = kernel_scripted(input, *args, **kwargs) + with freeze_rng_state(): + expected = kernel(input, *args, **kwargs) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + +def _check_kernel_batched_vs_unbatched(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces close results for batched and unbatched inputs.""" + unbatched_input = input.as_subclass(torch.Tensor) + + for batch_dims in [(2,), (2, 1)]: + repeats = [*batch_dims, *[1] * input.ndim] + + actual = kernel(unbatched_input.repeat(repeats), *args, **kwargs) + + expected = kernel(unbatched_input, *args, **kwargs) + # We can't directly call `.repeat()` on the output, since some kernel also return some additional metadata + if isinstance(expected, torch.Tensor): + expected = expected.repeat(repeats) + else: + tensor, *metadata = expected + expected = (tensor.repeat(repeats), *metadata) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + for degenerate_batch_dims in [(0,), (5, 0), (0, 5)]: + degenerate_batched_input = torch.empty( + degenerate_batch_dims + input.shape, dtype=input.dtype, device=input.device + ) + + output = kernel(degenerate_batched_input, *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + assert output.shape[: -input.ndim] == degenerate_batch_dims + + +def check_kernel( + kernel, + input, + *args, + check_cuda_vs_cpu=True, + check_scripted_vs_eager=True, + check_batched_vs_unbatched=True, + **kwargs, +): + initial_input_version = input._version + + output = kernel(input.as_subclass(torch.Tensor), *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + # check that no inplace operation happened + assert input._version == initial_input_version + + if kernel not in {F.to_dtype_image, F.to_dtype_video}: + assert output.dtype 
== input.dtype + assert output.device == input.device + + if check_cuda_vs_cpu: + _check_kernel_cuda_vs_cpu(kernel, input, *args, **kwargs, **_to_tolerances(check_cuda_vs_cpu)) + + if check_scripted_vs_eager: + _check_kernel_scripted_vs_eager(kernel, input, *args, **kwargs, **_to_tolerances(check_scripted_vs_eager)) + + if check_batched_vs_unbatched: + _check_kernel_batched_vs_unbatched(kernel, input, *args, **kwargs, **_to_tolerances(check_batched_vs_unbatched)) + + +def _check_functional_scripted_smoke(functional, input, *args, **kwargs): + """Checks if the functional can be scripted and the scripted version can be called without error.""" + if not isinstance(input, tv_tensors.Image): + return + + functional_scripted = _script(functional) + with ignore_jit_no_profile_information_warning(): + functional_scripted(input.as_subclass(torch.Tensor), *args, **kwargs) + + +def check_functional(functional, input, *args, check_scripted_smoke=True, **kwargs): + unknown_input = object() + with pytest.raises(TypeError, match=re.escape(str(type(unknown_input)))): + functional(unknown_input, *args, **kwargs) + + with mock.patch("torch._C._log_api_usage_once", wraps=torch._C._log_api_usage_once) as spy: + output = functional(input, *args, **kwargs) + + spy.assert_any_call(f"{functional.__module__}.{functional.__name__}") + + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes) and functional is not F.convert_bounding_box_format: + assert output.format == input.format + + if check_scripted_smoke: + _check_functional_scripted_smoke(functional, input, *args, **kwargs) + + +def check_functional_kernel_signature_match(functional, *, kernel, input_type): + """Checks if the signature of the functional matches the kernel signature.""" + functional_params = list(inspect.signature(functional).parameters.values())[1:] + kernel_params = list(inspect.signature(kernel).parameters.values())[1:] + + if issubclass(input_type, tv_tensors.TVTensor): + # We filter out metadata that is implicitly passed to the functional through the input tv_tensor, but has to be + # explicitly passed to the kernel. + explicit_metadata = {tv_tensors.BoundingBoxes: {"format", "canvas_size"}, tv_tensors.KeyPoints: {"canvas_size"}} + kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] + + functional_params = iter(functional_params) + for functional_param, kernel_param in zip(functional_params, kernel_params): + try: + # In general, the functional parameters are a superset of the kernel parameters. Thus, we filter out + # functional parameters that have no kernel equivalent while keeping the order intact. + while functional_param.name != kernel_param.name: + functional_param = next(functional_params) + except StopIteration: + raise AssertionError( + f"Parameter `{kernel_param.name}` of kernel `{kernel.__name__}` " + f"has no corresponding parameter on the functional `{functional.__name__}`." + ) from None + + if issubclass(input_type, PIL.Image.Image): + # PIL kernels often have more correct annotations, since they are not limited by JIT. Thus, we don't check + # them in the first place. 
+ functional_param._annotation = kernel_param._annotation = inspect.Parameter.empty + + assert functional_param == kernel_param + + +def _check_transform_v1_compatibility(transform, input, *, rtol, atol): + """If the transform defines the ``_v1_transform_cls`` attribute, checks if the transform has a public, static + ``get_params`` method that is the v1 equivalent, the output is close to v1, is scriptable, and the scripted version + can be called without error.""" + if not (type(input) is torch.Tensor or isinstance(input, PIL.Image.Image)): + return + + v1_transform_cls = transform._v1_transform_cls + if v1_transform_cls is None: + return + + if hasattr(v1_transform_cls, "get_params"): + assert type(transform).get_params is v1_transform_cls.get_params + + v1_transform = v1_transform_cls(**transform._extract_params_for_v1_transform()) + + with freeze_rng_state(): + output_v2 = transform(input) + + with freeze_rng_state(): + output_v1 = v1_transform(input) + + assert_close(F.to_image(output_v2), F.to_image(output_v1), rtol=rtol, atol=atol) + + if isinstance(input, PIL.Image.Image): + return + + _script(v1_transform)(input) + + +def _make_transform_sample(transform, *, image_or_video, adapter): + device = image_or_video.device if isinstance(image_or_video, torch.Tensor) else "cpu" + size = F.get_size(image_or_video) + input = dict( + image_or_video=image_or_video, + image_tv_tensor=make_image(size, device=device), + video_tv_tensor=make_video(size, device=device), + image_pil=make_image_pil(size), + bounding_boxes_xyxy=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.XYXY, device=device), + bounding_boxes_xywh=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.XYWH, device=device), + bounding_boxes_cxcywh=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.CXCYWH, device=device), + bounding_boxes_degenerate_xyxy=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [2, 0, 1, 1], # x1 > x2, y1 < y2 + [0, 2, 1, 1], # x1 < x2, y1 > y2 + [2, 2, 1, 1], # x1 > x2, y1 > y2 + ], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=size, + device=device, + ), + bounding_boxes_degenerate_xywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.XYWH, + canvas_size=size, + device=device, + ), + bounding_boxes_degenerate_cxcywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.CXCYWH, + canvas_size=size, + device=device, + ), + keypoints=make_keypoints(canvas_size=size), + detection_mask=make_detection_masks(size, device=device), + segmentation_mask=make_segmentation_mask(size, device=device), + int=0, + float=0.0, + bool=True, + none=None, + str="str", + path=Path.cwd(), + object=object(), + tensor=torch.empty(5), + array=np.empty(5), + ) + if adapter is not None: + input = adapter(transform, input, device) + return input + + +def _check_transform_sample_input_smoke(transform, input, *, adapter): + # This is a bunch of input / output convention checks, using a big sample with different parts as input. 
+ + if not check_type(input, (is_pure_tensor, PIL.Image.Image, tv_tensors.Image, tv_tensors.Video)): + return + + sample = _make_transform_sample( + # adapter might change transform inplace + transform=transform if adapter is None else deepcopy(transform), + image_or_video=input, + adapter=adapter, + ) + for container_type in [dict, list, tuple]: + if container_type is dict: + input = sample + else: + input = container_type(sample.values()) + + input_flat, input_spec = tree_flatten(input) + + with freeze_rng_state(): + torch.manual_seed(0) + output = transform(input) + output_flat, output_spec = tree_flatten(output) + + assert output_spec == input_spec + + for output_item, input_item, should_be_transformed in zip( + output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat) + ): + if should_be_transformed: + assert type(output_item) is type(input_item) + else: + assert output_item is input_item + + # Enforce that the transform does not turn a degenerate bounding box, e.g. marked by RandomIoUCrop (or any other + # future transform that does this), back into a valid one. + # TODO: We may want to do that for KeyPoints too + for degenerate_bounding_boxes in ( + bounding_box + for name, bounding_box in sample.items() + if "degenerate" in name and isinstance(bounding_box, tv_tensors.BoundingBoxes) + ): + sample = dict( + boxes=degenerate_bounding_boxes, + labels=torch.randint(10, (degenerate_bounding_boxes.shape[0],), device=degenerate_bounding_boxes.device), + ) + assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) + + +def check_transform(transform, input, check_v1_compatibility=True, check_sample_input=True): + pickle.loads(pickle.dumps(transform)) + + output = transform(input) + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes) and not isinstance(transform, transforms.ConvertBoundingBoxFormat): + assert output.format == input.format + + if check_sample_input: + _check_transform_sample_input_smoke( + transform, input, adapter=check_sample_input if callable(check_sample_input) else None + ) + + if check_v1_compatibility: + _check_transform_v1_compatibility(transform, input, **_to_tolerances(check_v1_compatibility)) + + return output + + +def transform_cls_to_functional(transform_cls, **transform_specific_kwargs): + def wrapper(input, *args, **kwargs): + transform = transform_cls(*args, **transform_specific_kwargs, **kwargs) + return transform(input) + + wrapper.__name__ = transform_cls.__name__ + + return wrapper + + +def param_value_parametrization(**kwargs): + """Helper function to turn + + @pytest.mark.parametrize( + ("param", "value"), + ("a", 1), + ("a", 2), + ("a", 3), + ("b", -1.0) + ("b", 1.0) + ) + + into + + @param_value_parametrization(a=[1, 2, 3], b=[-1.0, 1.0]) + """ + return pytest.mark.parametrize( + ("param", "value"), + [(param, value) for param, values in kwargs.items() for value in values], + ) + + +def adapt_fill(value, *, dtype): + """Adapt fill values in the range [0.0, 1.0] to the value range of the dtype""" + if value is None: + return value + + max_value = get_max_value(dtype) + value_type = float if dtype.is_floating_point else int + + if isinstance(value, (int, float)): + return value_type(value * max_value) + elif isinstance(value, (list, tuple)): + return type(value)(value_type(v * max_value) for v in value) + else: + raise ValueError(f"fill should be an int or float, or a list or tuple of the former, but got '{value}'.") + + +EXHAUSTIVE_TYPE_FILLS = [ + None, + 1, + 0.5, + [1], + 
[0.2], + (0,), + (0.7,), + [1, 0, 1], + [0.1, 0.2, 0.3], + (0, 1, 0), + (0.9, 0.234, 0.314), +] +CORRECTNESS_FILLS = [ + v for v in EXHAUSTIVE_TYPE_FILLS if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1) +] + + +# We cannot use `list(transforms.InterpolationMode)` here, since it includes some PIL-only ones as well +INTERPOLATION_MODES = [ + transforms.InterpolationMode.NEAREST, + transforms.InterpolationMode.NEAREST_EXACT, + transforms.InterpolationMode.BILINEAR, + transforms.InterpolationMode.BICUBIC, +] + + +def reference_affine_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True): + format = bounding_boxes.format + canvas_size = new_canvas_size or bounding_boxes.canvas_size + clamping_mode = bounding_boxes.clamping_mode + + def affine_bounding_boxes(bounding_boxes): + dtype = bounding_boxes.dtype + device = bounding_boxes.device + + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + input_xyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y1, 1.0], + [x1, y2, 1.0], + [x2, y2, 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + + output_xyxy = torch.Tensor( + [ + float(np.min(transformed_points[:, 0])), + float(np.min(transformed_points[:, 1])), + float(np.max(transformed_points[:, 0])), + float(np.max(transformed_points[:, 1])), + ] + ) + + output = F.convert_bounding_box_format( + output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format + ) + + if clamp: + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + output = F.clamp_bounding_boxes( + output, + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ) + else: + # We leave the bounding box as float64 so the caller gets the full precision to perform any additional + # operation + dtype = output.dtype + + return output.to(dtype=dtype, device=device) + + return tv_tensors.BoundingBoxes( + torch.cat([affine_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape( + bounding_boxes.shape + ), + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ) + + +def reference_affine_rotated_bounding_boxes_helper( + bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True, flip=False +): + format = bounding_boxes.format + canvas_size = new_canvas_size or bounding_boxes.canvas_size + clamping_mode = bounding_boxes.clamping_mode + + def affine_rotated_bounding_boxes(bounding_boxes): + dtype = bounding_boxes.dtype + device = bounding_boxes.device + + # Go to float before converting to prevent precision loss in case of CXCYWHR -> XYXYXYXY and W or H is 1 + input_xyxyxyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, + inplace=True, + ) + x1, y1, x2, y2, x3, y3, x4, y4 = input_xyxyxyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y2, 1.0], + [x3, y3, 1.0], + [x4, y4, 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + output = torch.tensor( + [ + float(transformed_points[0, 0]), + float(transformed_points[0, 1]), + 
float(transformed_points[1, 0]), + float(transformed_points[1, 1]), + float(transformed_points[2, 0]), + float(transformed_points[2, 1]), + float(transformed_points[3, 0]), + float(transformed_points[3, 1]), + ] + ) + + output = output[[2, 3, 0, 1, 6, 7, 4, 5]] if flip else output + output = _parallelogram_to_bounding_boxes(output) + + output = F.convert_bounding_box_format( + output, old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, new_format=format + ) + + return ( + F.clamp_bounding_boxes( + output.to(dtype=dtype, device=device), + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ) + if clamp + else output.to(dtype=output.dtype, device=device) + ) + + return tv_tensors.BoundingBoxes( + torch.cat( + [ + affine_rotated_bounding_boxes(b) + for b in bounding_boxes.reshape( + -1, 5 if format != tv_tensors.BoundingBoxFormat.XYXYXYXY else 8 + ).unbind() + ], + dim=0, + ).reshape(bounding_boxes.shape), + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ) + + +def reference_affine_keypoints_helper(keypoints, *, affine_matrix, new_canvas_size=None, clamp=True): + canvas_size = new_canvas_size or keypoints.canvas_size + + def affine_keypoints(keypoints): + dtype = keypoints.dtype + device = keypoints.device + + # Go to float before converting to prevent precision loss + x, y = keypoints.to(dtype=torch.float64, device="cpu", copy=True).squeeze(0).tolist() + + points = np.array([[x, y, 1.0]]) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + + output = torch.Tensor( + [ + float(transformed_points[0, 0]), + float(transformed_points[0, 1]), + ] + ) + + if clamp: + output = F.clamp_keypoints(output, canvas_size=canvas_size) + else: + dtype = output.dtype + + return output.to(dtype=dtype, device=device) + + return tv_tensors.KeyPoints( + torch.cat([affine_keypoints(k) for k in keypoints.reshape(-1, 2).unbind()], dim=0).reshape(keypoints.shape), + canvas_size=canvas_size, + ) + + +class TestResize: + INPUT_SIZE = (17, 11) + OUTPUT_SIZES = [17, [17], (17,), None, [12, 13], (12, 13)] + + def _make_max_size_kwarg(self, *, use_max_size, size): + if size is None: + max_size = min(list(self.INPUT_SIZE)) + elif use_max_size: + if not (isinstance(size, int) or len(size) == 1): + # This would result in an `ValueError` + return None + + max_size = (size if isinstance(size, int) else size[0]) + 1 + else: + max_size = None + + return dict(max_size=max_size) + + def _compute_output_size(self, *, input_size, size, max_size): + if size is None: + size = max_size + + elif not (isinstance(size, int) or len(size) == 1): + return tuple(size) + + elif not isinstance(size, int): + size = size[0] + + old_height, old_width = input_size + ratio = old_width / old_height + if ratio > 1: + new_height = size + new_width = int(ratio * new_height) + else: + new_width = size + new_height = int(new_width / ratio) + + if max_size is not None and max(new_height, new_width) > max_size: + # Need to recompute the aspect ratio, since it might have changed due to rounding + ratio = new_width / new_height + if ratio > 1: + new_width = max_size + new_height = int(new_width / ratio) + else: + new_height = max_size + new_width = int(new_height * ratio) + + return new_height, new_width + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("antialias", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, 
torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, size, interpolation, use_max_size, antialias, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + # In contrast to CPU, there is no native `InterpolationMode.BICUBIC` implementation for uint8 images on CUDA. + # Internally, it uses the float path. Thus, we need to test with an enormous tolerance here to account for that. + atol = 30 if (interpolation is transforms.InterpolationMode.BICUBIC and dtype is torch.uint8) else 1 + check_cuda_vs_cpu_tolerances = dict(rtol=0, atol=atol / 255 if dtype.is_floating_point else atol) + + check_kernel( + F.resize_image, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + size=size, + interpolation=interpolation, + **max_size_kwarg, + antialias=antialias, + check_cuda_vs_cpu=check_cuda_vs_cpu_tolerances, + check_scripted_vs_eager=not isinstance(size, int), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + + bounding_boxes = make_bounding_boxes( + format=format, + canvas_size=self.INPUT_SIZE, + dtype=dtype, + device=device, + ) + check_kernel( + F.resize_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + size=size, + **max_size_kwarg, + check_scripted_vs_eager=not isinstance(size, int), + ) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, size, use_max_size, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + keypoints = make_keypoints( + canvas_size=self.INPUT_SIZE, + dtype=dtype, + device=device, + ) + check_kernel( + F.resize_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + size=size, + **max_size_kwarg, + check_scripted_vs_eager=not isinstance(size, int), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.resize_mask, make_mask(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1]) + + def test_kernel_video(self): + check_kernel(F.resize_video, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1], antialias=True) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, size, make_input): + max_size_kwarg = self._make_max_size_kwarg(use_max_size=size is None, size=size) + + check_functional( + F.resize, + make_input(self.INPUT_SIZE), + size=size, + **max_size_kwarg, + antialias=True, + check_scripted_smoke=not isinstance(size, int), + ) + + 
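Note: for readers following `_make_max_size_kwarg` and `_compute_output_size` above, a worked numeric example of the expected-size arithmetic may help. `expected_resize_size` below is a hypothetical standalone restatement of the test helper's int-`size` path (it is not part of the suite), using the suite's `INPUT_SIZE = (17, 11)` and the `max_size = size + 1` convention:

def expected_resize_size(input_size, size, max_size=None):
    # Mirrors TestResize._compute_output_size for an int `size`.
    old_h, old_w = input_size
    ratio = old_w / old_h
    if ratio > 1:                      # wide image: height is the smaller edge
        new_h, new_w = size, int(ratio * size)
    else:                              # tall/square image: width is the smaller edge
        new_w = size
        new_h = int(new_w / ratio)
    if max_size is not None and max(new_h, new_w) > max_size:
        # Recompute the ratio from the already-rounded dims, as the helper does.
        ratio = new_w / new_h
        if ratio > 1:
            new_w = max_size
            new_h = int(new_w / ratio)
        else:
            new_h = max_size
            new_w = int(new_h * ratio)
    return new_h, new_w

# With INPUT_SIZE = (17, 11), the width (11) is the smaller edge: size=12 gives
# width 12 and height int(12 * 17 / 11) == 18. max_size=13 then caps the larger
# edge at 13 and rescales the other one from the rounded dims.
assert expected_resize_size((17, 11), size=12) == (18, 12)
assert expected_resize_size((17, 11), size=12, max_size=13) == (13, 8)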
@pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.resize_image, torch.Tensor), + (F._geometry._resize_image_pil, PIL.Image.Image), + (F.resize_image, tv_tensors.Image), + (F.resize_mask, tv_tensors.Mask), + (F.resize_video, tv_tensors.Video), + (F.resize_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.resize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_transform(self, size, device, make_input): + max_size_kwarg = self._make_max_size_kwarg(use_max_size=size is None, size=size) + + check_transform( + transforms.Resize(size=size, **max_size_kwarg, antialias=True), + make_input(self.INPUT_SIZE, device=device), + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_v1_compatibility=dict(rtol=0, atol=1) if size is not None else False, + ) + + def _check_output_size(self, input, output, *, size, max_size): + assert tuple(F.get_size(output)) == self._compute_output_size( + input_size=F.get_size(input), size=size, max_size=max_size + ) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2. + # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` + @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_image_correctness(self, size, interpolation, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) + + actual = fn(image, size=size, interpolation=interpolation, **max_size_kwarg, antialias=True) + expected = F.to_image(F.resize(F.to_pil_image(image), size=size, interpolation=interpolation, **max_size_kwarg)) + + self._check_output_size(image, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected, atol=1, rtol=0) + + def _reference_resize_bounding_boxes(self, bounding_boxes, format, *, size, max_size=None): + old_height, old_width = bounding_boxes.canvas_size + new_height, new_width = self._compute_output_size( + input_size=bounding_boxes.canvas_size, size=size, max_size=max_size + ) + + if (old_height, old_width) == (new_height, new_width): + return bounding_boxes + + affine_matrix = np.array( + [ + [new_width / old_width, 0, 0], + [0, new_height / old_height, 0], + ], + ) + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + + return helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=(new_height, new_width), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + 
def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE) + + actual = fn(bounding_boxes, size=size, **max_size_kwarg) + expected = self._reference_resize_bounding_boxes(bounding_boxes, format=format, size=size, **max_size_kwarg) + + self._check_output_size(bounding_boxes, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected) + + def _reference_resize_keypoints(self, keypoints, *, size, max_size=None): + old_height, old_width = keypoints.canvas_size + new_height, new_width = self._compute_output_size( + input_size=keypoints.canvas_size, size=size, max_size=max_size + ) + + if (old_height, old_width) == (new_height, new_width): + return keypoints + + affine_matrix = np.array( + [ + [new_width / old_width, 0, 0], + [0, new_height / old_height, 0], + ], + ) + + return reference_affine_keypoints_helper( + keypoints, + affine_matrix=affine_matrix, + new_canvas_size=(new_height, new_width), + ) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_keypoints_correctness(self, size, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + keypoints = make_keypoints(canvas_size=self.INPUT_SIZE) + + actual = fn(keypoints, size=size, **max_size_kwarg) + expected = self._reference_resize_keypoints(keypoints, size=size, **max_size_kwarg) + + self._check_output_size(keypoints, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("interpolation", set(transforms.InterpolationMode) - set(INTERPOLATION_MODES)) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_pil_interpolation_compat_smoke(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + with ( + contextlib.nullcontext() + if isinstance(input, PIL.Image.Image) + # This error is triggered in PyTorch core + else pytest.raises(NotImplementedError, match=f"got {interpolation.value.lower()}") + ): + F.resize( + input, + size=self.OUTPUT_SIZES[0], + interpolation=interpolation, + ) + + def test_functional_pil_antialias_warning(self): + with pytest.warns(UserWarning, match="Anti-alias option is always applied for PIL Image input"): + F.resize(make_image_pil(self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], antialias=False) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_max_size_error(self, size, make_input): + if size is None: + # value can be anything other than an integer + max_size = None + match = "max_size must be an integer when size is None" + elif isinstance(size, int) or len(size) == 1: + max_size = (size if isinstance(size, int) else size[0]) - 1 + match = "must be strictly greater than the requested size" + else: + # value can be anything other than None + max_size = -1 + match = "size should be an int or a sequence of length 1" + + with pytest.raises(ValueError, match=match): + F.resize(make_input(self.INPUT_SIZE), 
size=size, max_size=max_size, antialias=True) + + if isinstance(size, list) and len(size) != 1: + with pytest.raises(ValueError, match="max_size should only be passed if size is None or specifies"): + F.resize(make_input(self.INPUT_SIZE), size=size, max_size=500) + + @pytest.mark.parametrize( + "input_size, max_size, expected_size", + [ + ((10, 10), 10, (10, 10)), + ((10, 20), 40, (20, 40)), + ((20, 10), 40, (40, 20)), + ((10, 20), 10, (5, 10)), + ((20, 10), 10, (10, 5)), + ], + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_resize_size_none(self, input_size, max_size, expected_size, make_input): + img = make_input(input_size) + out = F.resize(img, size=None, max_size=max_size) + assert F.get_size(out)[-2:] == list(expected_size) + + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_interpolation_int(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + # `InterpolationMode.NEAREST_EXACT` has no proper corresponding integer equivalent. Internally, we map it to + # `0` to be the same as `InterpolationMode.NEAREST` for PIL. However, for the tensor backend there is a + # difference and thus we don't test it here. + if isinstance(input, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: + return + + expected = F.resize(input, size=self.OUTPUT_SIZES[0], interpolation=interpolation, antialias=True) + actual = F.resize( + input, size=self.OUTPUT_SIZES[0], interpolation=pil_modes_mapping[interpolation], antialias=True + ) + + assert_equal(actual, expected) + + def test_transform_unknown_size_error(self): + with pytest.raises(ValueError, match="size can be an integer, a sequence of one or two integers, or None"): + transforms.Resize(size=object()) + + @pytest.mark.parametrize( + "size", [min(INPUT_SIZE), [min(INPUT_SIZE)], (min(INPUT_SIZE),), list(INPUT_SIZE), tuple(INPUT_SIZE)] + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_noop(self, size, make_input): + input = make_input(self.INPUT_SIZE) + + output = F.resize(input, size=F.get_size(input), antialias=True) + + # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there + # is a good reason to break this, feel free to downgrade to an equality check. + if isinstance(input, tv_tensors.TVTensor): + # We can't test identity directly, since that checks for the identity of the Python object. Since all + # tv_tensors unwrap before a kernel and wrap again afterwards, the Python object changes. 
Thus, we check + # that the underlying storage is the same + assert output.data_ptr() == input.data_ptr() + else: + assert output is input + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_no_regression_5405(self, make_input): + # Checks that `max_size` is not ignored if `size == small_edge_size` + # See https://github.com/pytorch/vision/issues/5405 + + input = make_input(self.INPUT_SIZE) + + size = min(F.get_size(input)) + max_size = size + 1 + output = F.resize(input, size=size, max_size=max_size, antialias=True) + + assert max(F.get_size(output)) == max_size + + def _make_image(self, *args, batch_dims=(), memory_format=torch.contiguous_format, **kwargs): + # torch.channels_last memory_format is only available for 4D tensors, i.e. (B, C, H, W). However, images coming + # from PIL or our own I/O functions do not have a batch dimensions and are thus 3D, i.e. (C, H, W). Still, the + # layout of the data in memory is channels last. To emulate this when a 3D input is requested here, we create + # the image as 4D and create a view with the right shape afterwards. With this the layout in memory is channels + # last although PyTorch doesn't recognizes it as such. + emulate_channels_last = memory_format is torch.channels_last and len(batch_dims) != 1 + + image = make_image( + *args, + batch_dims=(math.prod(batch_dims),) if emulate_channels_last else batch_dims, + memory_format=memory_format, + **kwargs, + ) + + if emulate_channels_last: + image = tv_tensors.wrap(image.view(*batch_dims, *image.shape[-3:]), like=image) + + return image + + def _check_stride(self, image, *, memory_format): + C, H, W = F.get_dimensions(image) + if memory_format is torch.contiguous_format: + expected_stride = (H * W, W, 1) + elif memory_format is torch.channels_last: + expected_stride = (1, W * C, C) + else: + raise ValueError(f"Unknown memory_format: {memory_format}") + + assert image.stride() == expected_stride + + # TODO: We can remove this test and related torchvision workaround + # once we fixed related pytorch issue: https://github.com/pytorch/pytorch/issues/68430 + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize("antialias", [True, False]) + @pytest.mark.parametrize("memory_format", [torch.contiguous_format, torch.channels_last]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_memory_format_consistency(self, interpolation, antialias, memory_format, dtype, device): + size = self.OUTPUT_SIZES[0] + + input = self._make_image(self.INPUT_SIZE, dtype=dtype, device=device, memory_format=memory_format) + + # Smoke test to make sure we aren't starting with wrong assumptions + self._check_stride(input, memory_format=memory_format) + + output = F.resize_image(input, size=size, interpolation=interpolation, antialias=antialias) + + self._check_stride(output, memory_format=memory_format) + + def test_float16_no_rounding(self): + # Make sure Resize() doesn't round float16 images + # Non-regression test for https://github.com/pytorch/vision/issues/7667 + + input = make_image_tensor(self.INPUT_SIZE, dtype=torch.float16) + output = F.resize_image(input, size=self.OUTPUT_SIZES[0], antialias=True) + + assert output.dtype is torch.float16 + assert (output.round() - output).abs().sum() > 0 + + +class TestHorizontalFlip: + 
@pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.horizontal_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.horizontal_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + check_kernel( + F.horizontal_flip_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.horizontal_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.horizontal_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.horizontal_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.horizontal_flip_image, torch.Tensor), + (F._geometry._horizontal_flip_image_pil, PIL.Image.Image), + (F.horizontal_flip_image, tv_tensors.Image), + (F.horizontal_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.horizontal_flip_mask, tv_tensors.Mask), + (F.horizontal_flip_video, tv_tensors.Video), + (F.horizontal_flip_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.horizontal_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomHorizontalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.horizontal_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes): + affine_matrix = np.array( + [ + [-1, 0, bounding_boxes.canvas_size[1]], + [0, 1, 0], + ], + ) + + helper = ( + functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True) + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, 
clamp=False) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + def _reference_horizontal_flip_keypoints(self, keypoints): + affine_matrix = np.array( + [ + [-1, 0, keypoints.canvas_size[1] - 1], + [0, 1, 0], + ], + ) + + return reference_affine_keypoints_helper(keypoints, affine_matrix=affine_matrix) + + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_keypoints_correctness(self, fn): + keypoints = make_keypoints() + + actual = fn(keypoints) + expected = self._reference_horizontal_flip_keypoints(keypoints) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomHorizontalFlip(p=0) + + output = transform(input) + + assert_equal(output, input) + + +class TestAffine: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + translate=[[6.3, -0.6], [1, -3], (16.6, -6.6), (-2, 4)], + # float + scale=[0.5], + # float, int, + # one-list of float, one-list of int, one-tuple of float, one-tuple of int + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + shear=[35.6, 38, [-37.7], [-23], (5.3,), (-52,), [5.4, 21.8], [-47, 51], (-11.2, 36.7), (8, -53)], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + # The special case for shear makes sure we pick a value that is supported while JIT scripting + _MINIMAL_AFFINE_KWARGS = { + k: vs[0] if k != "shear" else next(v for v in vs if isinstance(v, list)) + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + translate=[None, (0.5, 0.5)], + scale=[None, (0.75, 1.25)], + shear=[None, (12, 30, -17, 5), 10, (-5, 12)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = { + k: next(v for v in vs if v is not None) for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items() + } + + def _check_kernel(self, kernel, input, *args, **kwargs): + kwargs_ = self._MINIMAL_AFFINE_KWARGS.copy() + kwargs_.update(kwargs) + check_kernel(kernel, input, *args, **kwargs_) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + 
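# --- Editor's sketch (NumPy/Python only, not part of the tests): the reference affine
# matrix built by _compute_affine_matrix further down in this class is the composition
# T @ C @ (R*S) @ Sh_y @ Sh_x @ C^-1, i.e. shear first, then rotate-and-scale about
# `center`, then translate. With all parameters at their neutral values it collapses
# to the 2x3 identity, which is a quick sanity check of the composition order.
import math

import numpy as np

def compose_affine(angle, translate, scale, shear, center):
    rot = math.radians(angle)
    sx, sy = (math.radians(s) for s in shear)
    cx, cy = center
    tx, ty = translate
    c = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]])
    t = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
    rs = np.array(
        [
            [scale * math.cos(rot), -scale * math.sin(rot), 0],
            [scale * math.sin(rot), scale * math.cos(rot), 0],
            [0, 0, 1],
        ]
    )
    shear_x = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]])
    shear_y = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]])
    return (t @ c @ rs @ shear_y @ shear_x @ np.linalg.inv(c))[:2, :]

assert np.allclose(compose_affine(0, (0, 0), 1.0, (0, 0), (5, 5)), np.eye(3)[:2, :])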
@pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "fill": + value = adapt_fill(value, dtype=dtype) + self._check_kernel( + F.affine_image, + make_image(dtype=dtype, device=device), + **{param: value}, + check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), + check_cuda_vs_cpu=( + dict(atol=1, rtol=0) + if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR + else True + ), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + self._check_kernel( + F.affine_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **{param: value}, + check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, param, value, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + self._check_kernel( + F.affine_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + **{param: value}, + check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + self._check_kernel(F.affine_mask, make_mask()) + + def test_kernel_video(self): + self._check_kernel(F.affine_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.affine, make_input(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.affine_image, torch.Tensor), + (F._geometry._affine_image_pil, PIL.Image.Image), + (F.affine_image, tv_tensors.Image), + (F.affine_bounding_boxes, tv_tensors.BoundingBoxes), + (F.affine_mask, tv_tensors.Mask), + (F.affine_video, tv_tensors.Video), + (F.affine_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.affine, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + 
make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + input = make_input(device=device) + + check_transform(transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), input) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) + @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) + @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_functional_image_correctness(self, angle, translate, scale, shear, center, interpolation, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = F.affine( + image, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + interpolation=interpolation, + fill=fill, + ) + expected = F.to_image( + F.affine( + F.to_pil_image(image), + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + interpolation=interpolation, + fill=fill, + ) + ) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 if interpolation is transforms.InterpolationMode.NEAREST else 8 + + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, center, interpolation, fill, seed): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + transform = transforms.RandomAffine( + **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center, interpolation=interpolation, fill=fill + ) + + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 if interpolation is transforms.InterpolationMode.NEAREST else 8 + + def _compute_affine_matrix(self, *, angle, translate, scale, shear, center): + rot = math.radians(angle) + cx, cy = center + tx, ty = translate + sx, sy = (math.radians(s) for s in ([shear, 0.0] if isinstance(shear, (int, float)) else shear)) + + c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) + t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) + c_matrix_inv = np.linalg.inv(c_matrix) + rs_matrix = np.array( + [ + [scale * math.cos(rot), -scale * math.sin(rot), 0], + [scale * math.sin(rot), scale * math.cos(rot), 0], + [0, 0, 1], + ] + ) + shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) + shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) + rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) + true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) + return true_matrix[:2, :] + + def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, scale, shear, center): + if center is None: + center = [s * 0.5 for s in 
bounding_boxes.canvas_size[::-1]] + + affine_matrix = self._compute_affine_matrix( + angle=angle, translate=translate, scale=scale, shear=shear, center=center + ) + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + + return helper( + bounding_boxes, + affine_matrix=affine_matrix, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) + @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) + @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center): + bounding_boxes = make_bounding_boxes(format=format) + + actual = F.affine( + bounding_boxes, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + expected = self._reference_affine_bounding_boxes( + bounding_boxes, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + + torch.testing.assert_close(actual, expected, atol=1e-4, rtol=1e-4) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, format, center, seed): + bounding_boxes = make_bounding_boxes(format=format) + + transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) + + torch.manual_seed(seed) + params = transform.make_params([bounding_boxes]) + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_affine_bounding_boxes(bounding_boxes, **params, center=center) + + torch.testing.assert_close(actual, expected, atol=1e-5, rtol=2e-5) + + def _reference_affine_keypoints(self, keypoints, *, angle, translate, scale, shear, center): + if center is None: + center = [s * 0.5 for s in keypoints.canvas_size[::-1]] + + return reference_affine_keypoints_helper( + keypoints, + affine_matrix=self._compute_affine_matrix( + angle=angle, translate=translate, scale=scale, shear=shear, center=center + ), + ) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) + @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) + @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_keypoints_correctness(self, angle, translate, scale, shear, center): + keypoints = make_keypoints() + + actual = F.affine( + keypoints, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + expected = self._reference_affine_keypoints( + keypoints, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_keypoints_correctness(self, center, seed): + keypoints = make_keypoints() + + transform = 
transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) + + torch.manual_seed(seed) + params = transform.make_params([keypoints]) + + torch.manual_seed(seed) + actual = transform(keypoints) + + expected = self._reference_affine_keypoints(keypoints, **params, center=center) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) + @pytest.mark.parametrize("translate", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["translate"]) + @pytest.mark.parametrize("scale", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["scale"]) + @pytest.mark.parametrize("shear", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["shear"]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transformmake_params_bounds(self, degrees, translate, scale, shear, seed): + image = make_image() + height, width = F.get_size(image) + + transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) + + torch.manual_seed(seed) + params = transform.make_params([image]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + if translate is not None: + width_max = int(round(translate[0] * width)) + height_max = int(round(translate[1] * height)) + assert -width_max <= params["translate"][0] <= width_max + assert -height_max <= params["translate"][1] <= height_max + else: + assert params["translate"] == (0, 0) + + if scale is not None: + assert scale[0] <= params["scale"] <= scale[1] + else: + assert params["scale"] == 1.0 + + if shear is not None: + if isinstance(shear, (int, float)): + assert -shear <= params["shear"][0] <= shear + assert params["shear"][1] == 0.0 + elif len(shear) == 2: + assert shear[0] <= params["shear"][0] <= shear[1] + assert params["shear"][1] == 0.0 + elif len(shear) == 4: + assert shear[0] <= params["shear"][0] <= shear[1] + assert shear[2] <= params["shear"][1] <= shear[3] + else: + assert params["shear"] == (0, 0) + + @pytest.mark.parametrize("param", ["degrees", "translate", "scale", "shear", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param in {"degrees", "shear"} and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomAffine(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + @pytest.mark.parametrize("translate", [[-1, 0], [2, 0], [-1, 2]]) + def test_transform_translate_range_error(self, translate): + with pytest.raises(ValueError, match="translation values should be between 0 and 1"): + transforms.RandomAffine(degrees=0, translate=translate) + + @pytest.mark.parametrize("scale", [[-1, 0], [0, -1], [-1, -1]]) + def test_transform_scale_range_error(self, scale): + with pytest.raises(ValueError, match="scale values should be positive"): + transforms.RandomAffine(degrees=0, scale=scale) + + def test_transform_negative_shear_error(self): + with pytest.raises(ValueError, match="If shear is a single number, it must be positive"): + transforms.RandomAffine(degrees=0, shear=-1) + + def test_transform_unknown_fill_error(self): 
+ with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + +class TestVerticalFlip: + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.vertical_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.vertical_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + check_kernel( + F.vertical_flip_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.vertical_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.vertical_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.vertical_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.vertical_flip_image, torch.Tensor), + (F._geometry._vertical_flip_image_pil, PIL.Image.Image), + (F.vertical_flip_image, tv_tensors.Image), + (F.vertical_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.vertical_flip_mask, tv_tensors.Mask), + (F.vertical_flip_video, tv_tensors.Video), + (F.vertical_flip_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.vertical_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomVerticalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.vertical_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes): + affine_matrix = np.array( + [ + [1, 0, 0], + [0, -1, bounding_boxes.canvas_size[0]], + ], + ) + + helper = ( + functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True) + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else 
reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, clamp=False) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + def _reference_vertical_flip_keypoints(self, keypoints): + affine_matrix = np.array( + [ + [1, 0, 0], + [0, -1, keypoints.canvas_size[0] - 1], + ], + ) + + return reference_affine_keypoints_helper(keypoints, affine_matrix=affine_matrix) + + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_keypoints_correctness(self, fn): + keypoints = make_keypoints() + + actual = fn(keypoints) + expected = self._reference_vertical_flip_keypoints(keypoints) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomVerticalFlip(p=0) + + output = transform(input) + + assert_equal(output, input) + + +class TestRotate: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + _MINIMAL_AFFINE_KWARGS = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items()} + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or isinstance(v, list)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items()} + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + check_kernel( + F.rotate_image, + make_image(dtype=dtype, device=device), + **kwargs, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] 
= self._MINIMAL_AFFINE_KWARGS["angle"] + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + if tv_tensors.is_rotated_bounding_format(format): + # TODO there is a 1e-6 difference between GPU and CPU outputs + # due to clamping. To avoid failing this test, we do clamp before hand. + bounding_boxes = F.clamp_bounding_boxes(bounding_boxes) + + check_kernel( + F.rotate_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **kwargs, + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, param, value, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + + keypoints = make_keypoints(dtype=dtype, device=device) + + check_kernel( + F.rotate_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + **kwargs, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.rotate_mask, make_mask(), **self._MINIMAL_AFFINE_KWARGS) + + def test_kernel_video(self): + check_kernel(F.rotate_video, make_video(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.rotate, make_input(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rotate_image, torch.Tensor), + (F._geometry._rotate_image_pil, PIL.Image.Image), + (F.rotate_image, tv_tensors.Image), + (F.rotate_mask, tv_tensors.Mask), + (F.rotate_video, tv_tensors.Video), + (F.rotate_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.rotate, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform( + transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), make_input(device=device) + ) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_functional_image_correctness(self, angle, center, interpolation, expand, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = F.rotate(image, angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill) + expected = F.to_image( + F.rotate( + F.to_pil_image(image), angle=angle, 
center=center, interpolation=interpolation, expand=expand, fill=fill + ) + ) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 1 if interpolation is transforms.InterpolationMode.NEAREST else 6 + + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, center, interpolation, expand, fill, seed): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + transform = transforms.RandomRotation( + **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, + center=center, + interpolation=interpolation, + expand=expand, + fill=fill, + ) + + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 1 if interpolation is transforms.InterpolationMode.NEAREST else 6 + + def _compute_output_canvas_size(self, *, expand, canvas_size, affine_matrix): + if not expand: + return canvas_size, (0.0, 0.0) + + input_height, input_width = canvas_size + + input_image_frame = np.array( + [ + [0.0, 0.0, 1.0], + [0.0, input_height, 1.0], + [input_width, input_height, 1.0], + [input_width, 0.0, 1.0], + ], + dtype=np.float64, + ) + output_image_frame = np.matmul(input_image_frame, affine_matrix.astype(input_image_frame.dtype).T) + + recenter_x = float(np.min(output_image_frame[:, 0])) + recenter_y = float(np.min(output_image_frame[:, 1])) + + output_width = int(np.max(output_image_frame[:, 0]) - recenter_x) + output_height = int(np.max(output_image_frame[:, 1]) - recenter_y) + + return (output_height, output_width), (recenter_x, recenter_y) + + def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy): + x, y = recenter_xy + if bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYXY: + translate = [x, y, x, y] + elif bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYXYXYXY: + translate = [x, y, x, y, x, y, x, y] + elif ( + bounding_boxes.format is tv_tensors.BoundingBoxFormat.CXCYWHR + or bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYWHR + ): + translate = [x, y, 0.0, 0.0, 0.0] + else: + translate = [x, y, 0.0, 0.0] + return tv_tensors.wrap( + (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes + ) + + def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center): + if center is None: + center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] + cx, cy = center + + a = np.cos(angle * np.pi / 180.0) + b = np.sin(angle * np.pi / 180.0) + affine_matrix = np.array( + [ + [a, b, cx - cx * a - b * cy], + [-b, a, cy + cx * b - a * cy], + ], + ) + + new_canvas_size, recenter_xy = self._compute_output_canvas_size( + expand=expand, canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix + ) + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + output = helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=new_canvas_size, + clamp=False, + ) + + return self._recenter_bounding_boxes_after_expand(output, 
recenter_xy=recenter_xy).to(bounding_boxes) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): + bounding_boxes = make_bounding_boxes(format=format, clamping_mode=None) + + actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) + expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): + bounding_boxes = make_bounding_boxes(format=format, clamping_mode=None) + + transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) + + torch.manual_seed(seed) + params = transform.make_params([bounding_boxes]) + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + torch.testing.assert_close(actual, expected) + + def _recenter_keypoints_after_expand(self, keypoints, *, recenter_xy): + x, y = recenter_xy + translate = [x, y] + return tv_tensors.wrap( + (keypoints.to(torch.float64) - torch.tensor(translate)).to(keypoints.dtype), like=keypoints + ) + + def _reference_rotate_keypoints(self, keypoints, *, angle, expand, center): + if center is None: + center = [s * 0.5 for s in keypoints.canvas_size[::-1]] + cx, cy = center + + a = np.cos(angle * np.pi / 180.0) + b = np.sin(angle * np.pi / 180.0) + affine_matrix = np.array( + [ + [a, b, cx - cx * a - b * cy], + [-b, a, cy + cx * b - a * cy], + ], + ) + + new_canvas_size, recenter_xy = self._compute_output_canvas_size( + expand=expand, canvas_size=keypoints.canvas_size, affine_matrix=affine_matrix + ) + + output = reference_affine_keypoints_helper( + keypoints, + affine_matrix=affine_matrix, + new_canvas_size=new_canvas_size, + clamp=False, + ) + + return F.clamp_keypoints(self._recenter_keypoints_after_expand(output, recenter_xy=recenter_xy)).to(keypoints) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_keypoints_correctness(self, angle, expand, center): + keypoints = make_keypoints() + + actual = F.rotate(keypoints, angle=angle, expand=expand, center=center) + expected = self._reference_rotate_keypoints(keypoints, angle=angle, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + 
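# --- Editor's sketch (NumPy only, a simplified illustration rather than the exact
# reference implementation): the expanded canvas computed by _compute_output_canvas_size
# above is the axis-aligned extent of the four rotated image corners. Rotating a
# 10 x 20 image by 90 degrees therefore yields a 20 x 10 canvas.
import numpy as np

def expanded_canvas(height, width, angle_deg):
    a, b = np.cos(np.radians(angle_deg)), np.sin(np.radians(angle_deg))
    corners = np.array([[0, 0], [width, 0], [width, height], [0, height]], dtype=np.float64)
    rotated = corners @ np.array([[a, -b], [b, a]]).T  # rotate each corner about the origin
    extent = rotated.max(axis=0) - rotated.min(axis=0)  # (width, height) of the bounding frame
    return int(extent[1]), int(extent[0])  # return as (height, width), like canvas_size

assert expanded_canvas(10, 20, 90) == (20, 10)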
@pytest.mark.parametrize("seed", list(range(5))) + def test_transform_keypoints_correctness(self, expand, center, seed): + keypoints = make_keypoints() + + transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) + + torch.manual_seed(seed) + params = transform.make_params([keypoints]) + + torch.manual_seed(seed) + actual = transform(keypoints) + + expected = self._reference_rotate_keypoints(keypoints, **params, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transformmake_params_bounds(self, degrees, seed): + transform = transforms.RandomRotation(degrees=degrees) + + torch.manual_seed(seed) + params = transform.make_params([]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + @pytest.mark.parametrize("param", ["degrees", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param == "degrees" and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomRotation(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + def test_transform_unknown_fill_error(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + @pytest.mark.parametrize("size", [(11, 17), (16, 16)]) + @pytest.mark.parametrize("angle", [0, 90, 180, 270]) + @pytest.mark.parametrize("expand", [False, True]) + def test_functional_image_fast_path_correctness(self, size, angle, expand): + image = make_image(size, dtype=torch.uint8, device="cpu") + + actual = F.rotate(image, angle=angle, expand=expand) + expected = F.to_image(F.rotate(F.to_pil_image(image), angle=angle, expand=expand)) + + torch.testing.assert_close(actual, expected) + + +class TestContainerTransforms: + class BuiltinTransform(transforms.Transform): + def transform(self, inpt, params): + return inpt + + class PackedInputTransform(nn.Module): + def forward(self, sample): + assert len(sample) == 2 + return sample + + class UnpackedInputTransform(nn.Module): + def forward(self, image, label): + return image, label + + @pytest.mark.parametrize( + "transform_cls", [transforms.Compose, functools.partial(transforms.RandomApply, p=1), transforms.RandomOrder] + ) + @pytest.mark.parametrize( + "wrapped_transform_clss", + [ + [BuiltinTransform], + [PackedInputTransform], + [UnpackedInputTransform], + [BuiltinTransform, BuiltinTransform], + [PackedInputTransform, PackedInputTransform], + [UnpackedInputTransform, UnpackedInputTransform], + [BuiltinTransform, PackedInputTransform, BuiltinTransform], + [BuiltinTransform, UnpackedInputTransform, BuiltinTransform], + [PackedInputTransform, BuiltinTransform, PackedInputTransform], + [UnpackedInputTransform, BuiltinTransform, UnpackedInputTransform], + ], + ) + 
@pytest.mark.parametrize("unpack", [True, False]) + def test_packed_unpacked(self, transform_cls, wrapped_transform_clss, unpack): + needs_packed_inputs = any(issubclass(cls, self.PackedInputTransform) for cls in wrapped_transform_clss) + needs_unpacked_inputs = any(issubclass(cls, self.UnpackedInputTransform) for cls in wrapped_transform_clss) + assert not (needs_packed_inputs and needs_unpacked_inputs) + + transform = transform_cls([cls() for cls in wrapped_transform_clss]) + + image = make_image() + label = 3 + packed_input = (image, label) + + def call_transform(): + if unpack: + return transform(*packed_input) + else: + return transform(packed_input) + + if needs_unpacked_inputs and not unpack: + with pytest.raises(TypeError, match="missing 1 required positional argument"): + call_transform() + elif needs_packed_inputs and unpack: + with pytest.raises(TypeError, match="takes 2 positional arguments but 3 were given"): + call_transform() + else: + output = call_transform() + + assert isinstance(output, tuple) and len(output) == 2 + assert output[0] is image + assert output[1] is label + + def test_compose(self): + transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ) + + input = make_image() + + actual = check_transform(transform, input) + expected = F.vertical_flip(F.horizontal_flip(input)) + + assert_equal(actual, expected) + + @pytest.mark.parametrize("p", [0.0, 1.0]) + @pytest.mark.parametrize("sequence_type", [list, nn.ModuleList]) + def test_random_apply(self, p, sequence_type): + transform = transforms.RandomApply( + sequence_type( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ), + p=p, + ) + + # This needs to be a pure tensor (or a PIL image), because otherwise check_transforms skips the v1 compatibility + # check + input = make_image_tensor() + output = check_transform(transform, input, check_v1_compatibility=issubclass(sequence_type, nn.ModuleList)) + + if p == 1: + assert_equal(output, F.vertical_flip(F.horizontal_flip(input))) + else: + assert output is input + + @pytest.mark.parametrize("p", [(0, 1), (1, 0)]) + def test_random_choice(self, p): + transform = transforms.RandomChoice( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ], + p=p, + ) + + input = make_image() + output = check_transform(transform, input) + + p_horz, p_vert = p + if p_horz: + assert_equal(output, F.horizontal_flip(input)) + else: + assert_equal(output, F.vertical_flip(input)) + + def test_random_order(self): + transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ) + + input = make_image() + + actual = check_transform(transform, input) + # We can't really check whether the transforms are actually applied in random order. However, horizontal and + # vertical flip are commutative. Meaning, even under the assumption that the transform applies them in random + # order, we can use a fixed order to compute the expected value. 
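# --- Editor's aside (standalone sketch, torch only): the commutativity the comment
# above relies on is easy to see on a raw tensor -- flipping along the width axis and
# then the height axis gives the same result as the reverse order, so the RandomOrder
# expectation can be computed with a fixed order.
import torch

x = torch.arange(12).reshape(1, 3, 4)  # a tiny (C, H, W) "image"
h_then_v = torch.flip(torch.flip(x, dims=[-1]), dims=[-2])  # hflip, then vflip
v_then_h = torch.flip(torch.flip(x, dims=[-2]), dims=[-1])  # vflip, then hflip
assert torch.equal(h_then_v, v_then_h)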
+ expected = F.vertical_flip(F.horizontal_flip(input)) + + assert_equal(actual, expected) + + def test_errors(self): + for cls in [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]: + with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): + cls(lambda x: x) + + for cls in ( + transforms.Compose, + transforms.RandomApply, + transforms.RandomChoice, + transforms.RandomOrder, + ): + + with pytest.raises(ValueError, match="at least one transform"): + cls([]) + + for p in [-1, 2]: + with pytest.raises(ValueError, match=re.escape("value in the interval [0.0, 1.0]")): + transforms.RandomApply([lambda x: x], p=p) + + for transforms_, p in [ + ([lambda x: x], []), + ( + [lambda x: x, lambda x: x], + [ + 1.0, + ], + ), + ]: + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): + transforms.RandomChoice(transforms_, p=p) + + +class TestToDtype: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.to_dtype_image, make_image_tensor), + (F.to_dtype_image, make_image), + (F.to_dtype_video, make_video), + ], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_kernel(self, kernel, make_input, input_dtype, output_dtype, device, scale): + check_kernel( + kernel, + make_input(dtype=input_dtype, device=device), + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_functional(self, make_input, input_dtype, output_dtype, device, scale): + check_functional( + F.to_dtype, + make_input(dtype=input_dtype, device=device), + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + @pytest.mark.parametrize("as_dict", (True, False)) + def test_transform(self, make_input, input_dtype, output_dtype, device, scale, as_dict): + input = make_input(dtype=input_dtype, device=device) + if as_dict: + output_dtype = {type(input): output_dtype} + check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input, check_sample_input=not as_dict) + + def reference_convert_dtype_image_tensor(self, image, dtype=torch.float, scale=False): + input_dtype = image.dtype + output_dtype = dtype + + if not scale: + return image.to(dtype) + + if output_dtype == input_dtype: + return image + + def fn(value): + if input_dtype.is_floating_point: + if output_dtype.is_floating_point: + return value + else: + return round(decimal.Decimal(value) * torch.iinfo(output_dtype).max) + else: + input_max_value = torch.iinfo(input_dtype).max + + if output_dtype.is_floating_point: + return float(decimal.Decimal(value) / input_max_value) + else: + 
output_max_value = torch.iinfo(output_dtype).max + + if input_max_value > output_max_value: + factor = (input_max_value + 1) // (output_max_value + 1) + return value / factor + else: + factor = (output_max_value + 1) // (input_max_value + 1) + return value * factor + + return torch.tensor(tree_map(fn, image.tolist())).to(dtype=output_dtype, device=image.device) + + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8, torch.uint16]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8, torch.uint16]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_image_correctness(self, input_dtype, output_dtype, device, scale): + if input_dtype.is_floating_point and output_dtype == torch.int64: + pytest.xfail("float to int64 conversion is not supported") + if input_dtype == torch.uint8 and output_dtype == torch.uint16 and device == "cuda": + pytest.xfail("uint8 to uint16 conversion is not supported on cuda") + + input = make_image(dtype=input_dtype, device=device) + + out = F.to_dtype(input, dtype=output_dtype, scale=scale) + expected = self.reference_convert_dtype_image_tensor(input, dtype=output_dtype, scale=scale) + + if input_dtype.is_floating_point and not output_dtype.is_floating_point and scale: + torch.testing.assert_close(out, expected, atol=1, rtol=0) + else: + torch.testing.assert_close(out, expected) + + def was_scaled(self, inpt): + # this assumes the target dtype is float + return inpt.max() <= 1 + + def make_inpt_with_bbox_and_mask(self, make_input): + H, W = 10, 10 + inpt_dtype = torch.uint8 + bbox_dtype = torch.float32 + mask_dtype = torch.bool + sample = { + "inpt": make_input(size=(H, W), dtype=inpt_dtype), + "bbox": make_bounding_boxes(canvas_size=(H, W), dtype=bbox_dtype), + "mask": make_detection_masks(size=(H, W), dtype=mask_dtype), + } + + return sample, inpt_dtype, bbox_dtype, mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + @pytest.mark.parametrize("scale", (True, False)) + def test_dtype_not_a_dict(self, make_input, scale): + # assert only inpt gets transformed when dtype isn't a dict + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype=torch.float32, scale=scale)(sample) + + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + if scale: + assert self.was_scaled(out["inpt"]) + else: + assert not self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_others_catch_all_and_none(self, make_input): + # make sure "others" works as a catch-all and that None means no conversion + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.int64, "others": None})(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_typical_use_case(self, make_input): + # Typical use-case: want to convert dtype and scale for inpt and just dtype for masks. 
+ # This just makes sure we now have a decent API for this + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype( + dtype={type(sample["inpt"]): torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True + )(sample) + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + assert self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_errors_warnings(self, make_input): + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + + with pytest.raises(ValueError, match="No dtype was specified for"): + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.float32})(sample) + with pytest.warns(UserWarning, match=re.escape("plain `torch.Tensor` will *not* be transformed")): + transforms.ToDtype(dtype={torch.Tensor: torch.float32, tv_tensors.Image: torch.float32}) + with pytest.warns(UserWarning, match="no scaling will be done"): + out = transforms.ToDtype(dtype={"others": None}, scale=True)(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + def test_uint16(self): + # These checks are probably already covered above but since uint16 is a + # newly supported dtype, we want to be extra careful, hence this + # explicit test + img_uint16 = torch.randint(0, 65535, (256, 512), dtype=torch.uint16) + + img_uint8 = F.to_dtype(img_uint16, torch.uint8, scale=True) + img_float32 = F.to_dtype(img_uint16, torch.float32, scale=True) + img_int32 = F.to_dtype(img_uint16, torch.int32, scale=True) + + assert_equal(img_uint8, (img_uint16 / 256).to(torch.uint8)) + assert_close(img_float32, (img_uint16 / 65535)) + + assert_close(F.to_dtype(img_float32, torch.uint16, scale=True), img_uint16, rtol=0, atol=1) + # Ideally we'd check against (img_uint16 & 0xFF00) but bitwise and isn't supported for it yet + # so we simulate it by scaling down and up again. 
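# --- Editor's aside (standalone sketch, torch only): the "scale down and up again"
# trick in the comment above works because, for values in the uint16 range, dividing
# by 256 and multiplying back keeps only the high byte, i.e. it equals x & 0xFF00.
# int64 is used here purely for the illustration, since bitwise and is the operation
# the comment says is not yet available for torch.uint16.
import torch

x = torch.randint(0, 65536, (1000,), dtype=torch.int64)
assert torch.equal((x // 256) * 256, x & 0xFF00)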
+ assert_equal(F.to_dtype(img_uint8, torch.uint16, scale=True), ((img_uint16 / 256).to(torch.uint16) * 256)) + assert_equal(F.to_dtype(img_int32, torch.uint16, scale=True), img_uint16) + + assert_equal(F.to_dtype(img_float32, torch.uint8, scale=True), img_uint8) + assert_close(F.to_dtype(img_uint8, torch.float32, scale=True), img_float32, rtol=0, atol=1e-2) + + +class TestAdjustBrightness: + _CORRECTNESS_BRIGHTNESS_FACTORS = [0.5, 0.0, 1.0, 5.0] + _DEFAULT_BRIGHTNESS_FACTOR = _CORRECTNESS_BRIGHTNESS_FACTORS[0] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.adjust_brightness_image, make_image), + (F.adjust_brightness_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_brightness, make_input(), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_brightness_image, torch.Tensor), + (F._color._adjust_brightness_image_pil, PIL.Image.Image), + (F.adjust_brightness_image, tv_tensors.Image), + (F.adjust_brightness_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_brightness, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("brightness_factor", _CORRECTNESS_BRIGHTNESS_FACTORS) + def test_image_correctness(self, brightness_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_brightness(image, brightness_factor=brightness_factor) + expected = F.to_image(F.adjust_brightness(F.to_pil_image(image), brightness_factor=brightness_factor)) + + torch.testing.assert_close(actual, expected) + + +class TestCutMixMixUp: + class DummyDataset: + def __init__(self, size, num_classes, one_hot_labels): + self.size = size + self.num_classes = num_classes + self.one_hot_labels = one_hot_labels + assert size < num_classes + + def __getitem__(self, idx): + img = torch.rand(3, 100, 100) + label = idx # This ensures all labels in a batch are unique and makes testing easier + if self.one_hot_labels: + label = torch.nn.functional.one_hot(torch.tensor(label), num_classes=self.num_classes) + return img, label + + def __len__(self): + return self.size + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + @pytest.mark.parametrize("one_hot_labels", (True, False)) + def test_supported_input_structure(self, T, one_hot_labels): + + batch_size = 32 + num_classes = 100 + + dataset = self.DummyDataset(size=batch_size, num_classes=num_classes, one_hot_labels=one_hot_labels) + + cutmix_mixup = T(num_classes=num_classes) + + dl = DataLoader(dataset, batch_size=batch_size) + + # Input sanity checks + img, target = next(iter(dl)) + input_img_size = img.shape[-3:] + assert isinstance(img, torch.Tensor) and isinstance(target, torch.Tensor) + assert target.shape == (batch_size, num_classes) if one_hot_labels else (batch_size,) + + def check_output(img, target): + assert img.shape == (batch_size, *input_img_size) + assert target.shape == (batch_size, num_classes) + torch.testing.assert_close(target.sum(axis=-1), torch.ones(batch_size)) + num_non_zero_labels 
= (target != 0).sum(axis=-1) + assert (num_non_zero_labels == 2).all() + + # After Dataloader, as unpacked input + img, target = next(iter(dl)) + assert target.shape == (batch_size, num_classes) if one_hot_labels else (batch_size,) + img, target = cutmix_mixup(img, target) + check_output(img, target) + + # After Dataloader, as packed input + packed_from_dl = next(iter(dl)) + assert isinstance(packed_from_dl, list) + img, target = cutmix_mixup(packed_from_dl) + check_output(img, target) + + # As collation function. We expect default_collate to be used by users. + def collate_fn_1(batch): + return cutmix_mixup(default_collate(batch)) + + def collate_fn_2(batch): + return cutmix_mixup(*default_collate(batch)) + + for collate_fn in (collate_fn_1, collate_fn_2): + dl = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn) + img, target = next(iter(dl)) + check_output(img, target) + + @needs_cuda + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_cpu_vs_gpu(self, T): + num_classes = 10 + batch_size = 3 + H, W = 12, 12 + + imgs = torch.rand(batch_size, 3, H, W) + labels = torch.randint(0, num_classes, (batch_size,)) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + _check_kernel_cuda_vs_cpu(cutmix_mixup, imgs, labels, rtol=None, atol=None) + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_error(self, T): + + num_classes = 10 + batch_size = 9 + + imgs = torch.rand(batch_size, 3, 12, 12) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + for input_with_bad_type in ( + F.to_pil_image(imgs[0]), + tv_tensors.Mask(torch.rand(12, 12)), + tv_tensors.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), + tv_tensors.KeyPoints(torch.rand(2, 2), canvas_size=(12, 12)), + ): + print(type(input_with_bad_type), cutmix_mixup) + with pytest.raises(ValueError, match="does not support PIL images, "): + cutmix_mixup(input_with_bad_type) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + cutmix_mixup({"img": imgs, "Nothing_else": 3}) + + with pytest.raises(ValueError, match="labels should be index based"): + # Note: the error message isn't ideal, but that's because the label heuristic found the img as the label + # It's OK, it's an edge-case. 
The important thing is that this fails loudly instead of passing silently + cutmix_mixup(imgs) + + with pytest.raises(ValueError, match="When using the default labels_getter"): + cutmix_mixup(imgs, "not_a_tensor") + + with pytest.raises(ValueError, match="Expected a batched input with 4 dims"): + cutmix_mixup(imgs[None, None], torch.randint(0, num_classes, size=(batch_size,))) + + with pytest.raises(ValueError, match="does not match the batch size of the labels"): + cutmix_mixup(imgs, torch.randint(0, num_classes, size=(batch_size + 1,))) + + with pytest.raises(ValueError, match="When passing 2D labels"): + wrong_num_classes = num_classes + 1 + T(alpha=0.5, num_classes=num_classes)(imgs, torch.randint(0, 2, size=(batch_size, wrong_num_classes))) + + with pytest.raises(ValueError, match="but got a tensor of shape"): + cutmix_mixup(imgs, torch.randint(0, 2, size=(2, 3, 4))) + + with pytest.raises(ValueError, match="num_classes must be passed"): + T(alpha=0.5)(imgs, torch.randint(0, num_classes, size=(batch_size,))) + + +@pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) +@pytest.mark.parametrize("sample_type", (tuple, list, dict)) +def test_labels_getter_default_heuristic(key, sample_type): + labels = torch.arange(10) + sample = {key: labels, "another_key": "whatever"} + if sample_type is not dict: + sample = sample_type((None, sample, "whatever_again")) + assert transforms._utils._find_labels_default_heuristic(sample) is labels + + if key.lower() != "labels": + # If "labels" is in the dict (case-insensitive), + # it takes precedence over other keys which would otherwise be a match + d = {key: "something_else", "labels": labels} + assert transforms._utils._find_labels_default_heuristic(d) is labels + + +class TestShapeGetters: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_dimensions_image, make_image_tensor), + (F._meta._get_dimensions_image_pil, make_image_pil), + (F.get_dimensions_image, make_image), + (F.get_dimensions_video, make_video), + ], + ) + def test_get_dimensions(self, kernel, make_input): + size = (10, 10) + color_space, num_channels = "RGB", 3 + + input = make_input(size, color_space=color_space) + + assert kernel(input) == F.get_dimensions(input) == [num_channels, *size] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_channels_image, make_image_tensor), + (F._meta._get_num_channels_image_pil, make_image_pil), + (F.get_num_channels_image, make_image), + (F.get_num_channels_video, make_video), + ], + ) + def test_get_num_channels(self, kernel, make_input): + color_space, num_channels = "RGB", 3 + + input = make_input(color_space=color_space) + + assert kernel(input) == F.get_num_channels(input) == num_channels + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_size_image, make_image_tensor), + (F._meta._get_size_image_pil, make_image_pil), + (F.get_size_image, make_image), + (F.get_size_bounding_boxes, make_bounding_boxes), + (F.get_size_keypoints, make_keypoints), + (F.get_size_mask, make_detection_masks), + (F.get_size_mask, make_segmentation_mask), + (F.get_size_video, make_video), + ], + ) + def test_get_size(self, kernel, make_input): + size = (10, 10) + + input = make_input(size) + + assert kernel(input) == F.get_size(input) == list(size) + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_frames_video, make_video_tensor), + (F.get_num_frames_video, make_video), + ], + ) + def test_get_num_frames(self, kernel, make_input): + 
num_frames = 4 + + input = make_input(num_frames=num_frames) + + assert kernel(input) == F.get_num_frames(input) == num_frames + + @pytest.mark.parametrize( + ("functional", "make_input"), + [ + (F.get_dimensions, make_bounding_boxes), + (F.get_dimensions, make_detection_masks), + (F.get_dimensions, make_segmentation_mask), + (F.get_num_channels, make_bounding_boxes), + (F.get_num_channels, make_detection_masks), + (F.get_num_channels, make_segmentation_mask), + (F.get_num_frames, make_image_pil), + (F.get_num_frames, make_image), + (F.get_num_frames, make_bounding_boxes), + (F.get_num_frames, make_detection_masks), + (F.get_num_frames, make_segmentation_mask), + ], + ) + def test_unsupported_types(self, functional, make_input): + input = make_input() + + with pytest.raises(TypeError, match=re.escape(str(type(input)))): + functional(input) + + +class TestRegisterKernel: + @pytest.mark.parametrize("functional", (F.resize, "resize")) + def test_register_kernel(self, functional): + class CustomTVTensor(tv_tensors.TVTensor): + pass + + kernel_was_called = False + + @F.register_kernel(functional, CustomTVTensor) + def new_resize(dp, *args, **kwargs): + nonlocal kernel_was_called + kernel_was_called = True + return dp + + t = transforms.Resize(size=(224, 224), antialias=True) + + my_dp = CustomTVTensor(torch.rand(3, 10, 10)) + out = t(my_dp) + assert out is my_dp + assert kernel_was_called + + # Sanity check to make sure we didn't override the kernel of other types + t(torch.rand(3, 10, 10)).shape == (3, 224, 224) + t(tv_tensors.Image(torch.rand(3, 10, 10))).shape == (3, 224, 224) + + def test_errors(self): + with pytest.raises(ValueError, match="Could not find functional with name"): + F.register_kernel("bad_name", tv_tensors.Image) + + with pytest.raises(ValueError, match="Kernels can only be registered on functionals"): + F.register_kernel(tv_tensors.Image, F.resize) + + with pytest.raises(ValueError, match="Kernels can only be registered for subclasses"): + F.register_kernel(F.resize, object) + + with pytest.raises(ValueError, match="cannot be registered for the builtin tv_tensor classes"): + F.register_kernel(F.resize, tv_tensors.Image)(F.resize_image) + + class CustomTVTensor(tv_tensors.TVTensor): + pass + + def resize_custom_tv_tensor(): + pass + + F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor) + + with pytest.raises(ValueError, match="already has a kernel registered for type"): + F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor) + + +class TestGetKernel: + # We are using F.resize as functional and the kernels below as proxy. Any other functional / kernels combination + # would also be fine + KERNELS = { + torch.Tensor: F.resize_image, + PIL.Image.Image: F._geometry._resize_image_pil, + tv_tensors.Image: F.resize_image, + tv_tensors.BoundingBoxes: F.resize_bounding_boxes, + tv_tensors.Mask: F.resize_mask, + tv_tensors.Video: F.resize_video, + } + + @pytest.mark.parametrize("input_type", [str, int, object]) + def test_unsupported_types(self, input_type): + with pytest.raises(TypeError, match="supports inputs of type"): + _get_kernel(F.resize, input_type) + + def test_exact_match(self): + # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the + # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional + # here, register the kernels without wrapper, and check the exact matching afterwards. 
+ def resize_with_pure_kernels(): + pass + + for input_type, kernel in self.KERNELS.items(): + _register_kernel_internal(resize_with_pure_kernels, input_type, tv_tensor_wrapper=False)(kernel) + + assert _get_kernel(resize_with_pure_kernels, input_type) is kernel + + def test_builtin_tv_tensor_subclass(self): + # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the + # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional + # here, register the kernels without wrapper, and check if subclasses of our builtin tv_tensors get dispatched + # to the kernel of the corresponding superclass + def resize_with_pure_kernels(): + pass + + class MyImage(tv_tensors.Image): + pass + + class MyBoundingBoxes(tv_tensors.BoundingBoxes): + pass + + class MyMask(tv_tensors.Mask): + pass + + class MyVideo(tv_tensors.Video): + pass + + for custom_tv_tensor_subclass in [ + MyImage, + MyBoundingBoxes, + MyMask, + MyVideo, + ]: + builtin_tv_tensor_class = custom_tv_tensor_subclass.__mro__[1] + builtin_tv_tensor_kernel = self.KERNELS[builtin_tv_tensor_class] + _register_kernel_internal(resize_with_pure_kernels, builtin_tv_tensor_class, tv_tensor_wrapper=False)( + builtin_tv_tensor_kernel + ) + + assert _get_kernel(resize_with_pure_kernels, custom_tv_tensor_subclass) is builtin_tv_tensor_kernel + + def test_tv_tensor_subclass(self): + class MyTVTensor(tv_tensors.TVTensor): + pass + + with pytest.raises(TypeError, match="supports inputs of type"): + _get_kernel(F.resize, MyTVTensor) + + def resize_my_tv_tensor(): + pass + + _register_kernel_internal(F.resize, MyTVTensor, tv_tensor_wrapper=False)(resize_my_tv_tensor) + + assert _get_kernel(F.resize, MyTVTensor) is resize_my_tv_tensor + + def test_pil_image_subclass(self): + opened_image = PIL.Image.open(Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + loaded_image = opened_image.convert("RGB") + + # check the assumptions + assert isinstance(opened_image, PIL.Image.Image) + assert type(opened_image) is not PIL.Image.Image + + assert type(loaded_image) is PIL.Image.Image + + size = [17, 11] + for image in [opened_image, loaded_image]: + kernel = _get_kernel(F.resize, type(image)) + + output = kernel(image, size=size) + + assert F.get_size(output) == size + + +class TestPermuteChannels: + _DEFAULT_PERMUTATION = [2, 0, 1] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.permute_channels_image, make_image_tensor), + # FIXME + # check_kernel does not support PIL kernel, but it should + (F.permute_channels_image, make_image), + (F.permute_channels_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.permute_channels, make_input(), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.permute_channels_image, torch.Tensor), + (F._color._permute_channels_image_pil, PIL.Image.Image), + (F.permute_channels_image, tv_tensors.Image), + (F.permute_channels_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + 
check_functional_kernel_signature_match(F.permute_channels, kernel=kernel, input_type=input_type) + + def reference_image_correctness(self, image, permutation): + channel_images = image.split(1, dim=-3) + permuted_channel_images = [channel_images[channel_idx] for channel_idx in permutation] + return tv_tensors.Image(torch.concat(permuted_channel_images, dim=-3)) + + @pytest.mark.parametrize("permutation", [[2, 0, 1], [1, 2, 0], [2, 0, 1], [0, 1, 2]]) + @pytest.mark.parametrize("batch_dims", [(), (2,), (2, 1)]) + def test_image_correctness(self, permutation, batch_dims): + image = make_image(batch_dims=batch_dims) + + actual = F.permute_channels(image, permutation=permutation) + expected = self.reference_image_correctness(image, permutation=permutation) + + torch.testing.assert_close(actual, expected) + + +class TestElastic: + def _make_displacement(self, inpt): + return torch.rand( + 1, + *F.get_size(inpt), + 2, + dtype=torch.float32, + device=inpt.device if isinstance(inpt, torch.Tensor) else "cpu", + ) + + @param_value_parametrization( + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8, torch.float16]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + image = make_image_tensor(dtype=dtype, device=device) + + check_kernel( + F.elastic_image, + image, + displacement=self._make_displacement(image), + **{param: value}, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + check_cuda_vs_cpu=dtype is not torch.float16, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + check_kernel( + F.elastic_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + displacement=self._make_displacement(bounding_boxes), + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + + check_kernel( + F.elastic_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + displacement=self._make_displacement(keypoints), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + mask = make_mask() + check_kernel(F.elastic_mask, mask, displacement=self._make_displacement(mask)) + + def test_kernel_video(self): + video = make_video() + check_kernel(F.elastic_video, video, displacement=self._make_displacement(video)) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + input = make_input() + check_functional(F.elastic, input, displacement=self._make_displacement(input)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.elastic_image, 
torch.Tensor), + (F._geometry._elastic_image_pil, PIL.Image.Image), + (F.elastic_image, tv_tensors.Image), + (F.elastic_mask, tv_tensors.Mask), + (F.elastic_video, tv_tensors.Video), + (F.elastic_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.elastic, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_displacement_error(self, make_input): + input = make_input() + + with pytest.raises(TypeError, match="displacement should be a Tensor"): + F.elastic(input, displacement=None) + + with pytest.raises(ValueError, match="displacement shape should be"): + F.elastic(input, displacement=torch.rand(F.get_size(input))) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image + @pytest.mark.parametrize("size", [(163, 163), (72, 333), (313, 95)]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, size, device): + # We have to skip that test on M1 because it's flaky: Mismatched elements: 35 / 89205 (0.0%) + # See https://github.com/pytorch/vision/issues/8154 + # All other platforms are fine, so the differences do not come from something we own in torchvision + check_v1_compatibility = False if sys.platform == "darwin" else dict(rtol=0, atol=1) + + check_transform( + transforms.ElasticTransform(), + make_input(size, device=device), + check_v1_compatibility=check_v1_compatibility, + ) + + +class TestToPureTensor: + def test_correctness(self): + input = { + "img": make_image(), + "img_tensor": make_image_tensor(), + "img_pil": make_image_pil(), + "mask": make_detection_masks(), + "video": make_video(), + "bbox": make_bounding_boxes(), + "str": "str", + } + + out = transforms.ToPureTensor()(input) + + for input_value, out_value in zip(input.values(), out.values()): + if isinstance(input_value, tv_tensors.TVTensor): + assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, tv_tensors.TVTensor) + else: + assert isinstance(out_value, type(input_value)) + + +class TestCrop: + INPUT_SIZE = (21, 11) + + CORRECTNESS_CROP_KWARGS = [ + # center + dict(top=5, left=5, height=10, width=5), + # larger than input, i.e. 
pad + dict(top=-5, left=-5, height=30, width=20), + # sides: left, right, top, bottom + dict(top=-5, left=-5, height=30, width=10), + dict(top=-5, left=5, height=30, width=10), + dict(top=-5, left=-5, height=20, width=20), + dict(top=5, left=-5, height=20, width=20), + # corners: top-left, top-right, bottom-left, bottom-right + dict(top=-5, left=-5, height=20, width=10), + dict(top=-5, left=5, height=20, width=10), + dict(top=5, left=-5, height=20, width=10), + dict(top=5, left=5, height=20, width=10), + ] + MINIMAL_CROP_KWARGS = CORRECTNESS_CROP_KWARGS[0] + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, kwargs, dtype, device): + check_kernel(F.crop_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **kwargs) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, kwargs, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + check_kernel(F.crop_bounding_boxes, bounding_boxes, format=format, **kwargs) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, kwargs, dtype, device): + keypoints = make_keypoints(self.INPUT_SIZE, dtype=dtype, device=device) + check_kernel(F.crop_keypoints, keypoints, **kwargs) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.crop_mask, make_mask(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + def test_kernel_video(self): + check_kernel(F.crop_video, make_video(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.crop, make_input(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.crop_image, torch.Tensor), + (F._geometry._crop_image_pil, PIL.Image.Image), + (F.crop_image, tv_tensors.Image), + (F.crop_bounding_boxes, tv_tensors.BoundingBoxes), + (F.crop_mask, tv_tensors.Mask), + (F.crop_video, tv_tensors.Video), + (F.crop_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.crop, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + def test_functional_image_correctness(self, kwargs): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = F.crop(image, **kwargs) + expected = F.to_image(F.crop(F.to_pil_image(image), **kwargs)) + + assert_equal(actual, expected) + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize( + 
"make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, param, value, make_input): + input = make_input(self.INPUT_SIZE) + + check_sample_input = True + if param == "fill": + if isinstance(value, (tuple, list)): + if isinstance(input, tv_tensors.Mask): + pytest.skip("F.pad_mask doesn't support non-scalar fill.") + else: + check_sample_input = False + + kwargs = dict( + # 1. size is required + # 2. the fill parameter only has an affect if we need padding + size=[s + 4 for s in self.INPUT_SIZE], + fill=adapt_fill(value, dtype=input.dtype if isinstance(input, torch.Tensor) else torch.uint8), + ) + else: + kwargs = {param: value} + + check_transform( + transforms.RandomCrop(**kwargs, pad_if_needed=True), + input, + check_v1_compatibility=param != "fill" or isinstance(value, (int, float)), + check_sample_input=check_sample_input, + ) + + @pytest.mark.parametrize("padding", [1, (1, 1), (1, 1, 1, 1)]) + def test_transform_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @pytest.mark.parametrize("padding", [None, 1, (1, 1), (1, 1, 1, 1)]) + def test_transform_insufficient_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 3 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + with pytest.raises(ValueError, match="larger than (padded )?input image size"): + transform(inpt) + + def test_transform_pad_if_needed(self): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s * 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, pad_if_needed=True) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=CORRECTNESS_FILLS, + padding_mode=["constant", "edge", "reflect", "symmetric"], + ) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, seed): + kwargs = {param: value} + if param != "size": + # 1. size is required + # 2. 
the fill / padding_mode parameters only have an affect if we need padding + kwargs["size"] = [s + 4 for s in self.INPUT_SIZE] + if param == "fill": + kwargs["fill"] = adapt_fill(kwargs["fill"], dtype=torch.uint8) + + transform = transforms.RandomCrop(pad_if_needed=True, **kwargs) + + image = make_image(self.INPUT_SIZE) + + with freeze_rng_state(): + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + assert_equal(actual, expected) + + def _reference_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width): + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width)) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_bounding_box_correctness(self, kwargs, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + + actual = F.crop(bounding_boxes, **kwargs) + expected = self._reference_crop_bounding_boxes(bounding_boxes, **kwargs) + + assert_equal(actual, expected, atol=1, rtol=0) + assert_equal(F.get_size(actual), F.get_size(expected)) + + @pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)]) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, output_size, format, dtype, device, seed): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + input_size = [s * 2 for s in output_size] + bounding_boxes = make_bounding_boxes(input_size, format=format, dtype=dtype, device=device) + + transform = transforms.RandomCrop(output_size) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform.make_params([bounding_boxes]) + assert not params.pop("needs_pad") + del params["padding"] + assert params.pop("needs_crop") + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_crop_bounding_boxes(bounding_boxes, **params) + + torch.testing.assert_close(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def _reference_crop_keypoints(self, keypoints, *, top, left, height, width): + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + return reference_affine_keypoints_helper( + keypoints, affine_matrix=affine_matrix, new_canvas_size=(height, width) + ) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_keypoints_correctness(self, kwargs, dtype, device): + keypoints = 
make_keypoints(self.INPUT_SIZE, dtype=dtype, device=device) + + actual = F.crop(keypoints, **kwargs) + expected = self._reference_crop_keypoints(keypoints, **kwargs) + + assert_equal(actual, expected, atol=1, rtol=0) + assert_equal(F.get_size(actual), F.get_size(expected)) + + @pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_keypoints_correctness(self, output_size, dtype, device, seed): + input_size = (output_size[0] * 2, output_size[1] * 2) + keypoints = make_keypoints(input_size, dtype=dtype, device=device) + + transform = transforms.RandomCrop(output_size) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform.make_params([keypoints]) + assert not params.pop("needs_pad") + del params["padding"] + assert params.pop("needs_crop") + + torch.manual_seed(seed) + actual = transform(keypoints) + + expected = self._reference_crop_keypoints(keypoints, **params) + + assert_equal(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def test_errors(self): + with pytest.raises(ValueError, match="Please provide only two dimensions"): + transforms.RandomCrop([10, 12, 14]) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding="abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=0.5) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[0.5, 0.5]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomCrop([10, 12], padding=1, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") + + +class TestErase: + INPUT_SIZE = (17, 11) + FUNCTIONAL_KWARGS = dict( + zip("ijhwv", [2, 2, 10, 8, torch.tensor(0.0, dtype=torch.float32, device="cpu").reshape(-1, 1, 1)]) + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.erase_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_inplace(self, dtype, device): + input = make_image(self.INPUT_SIZE, dtype=dtype, device=device) + input_version = input._version + + output_out_of_place = F.erase_image(input, **self.FUNCTIONAL_KWARGS) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.erase_image(input, **self.FUNCTIONAL_KWARGS, inplace=True) + assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + def test_kernel_video(self): + check_kernel(F.erase_video, make_video(self.INPUT_SIZE), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, 
make_video], + ) + def test_functional(self, make_input): + check_functional(F.erase, make_input(), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.erase_image, torch.Tensor), + (F._augment._erase_image_pil, PIL.Image.Image), + (F.erase_image, tv_tensors.Image), + (F.erase_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.erase, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + input = make_input(device=device) + + with pytest.warns(UserWarning, match="currently passing through inputs of type"): + check_transform( + transforms.RandomErasing(p=1), + input, + check_v1_compatibility=not isinstance(input, PIL.Image.Image), + ) + + def _reference_erase_image(self, image, *, i, j, h, w, v): + mask = torch.zeros_like(image, dtype=torch.bool) + mask[..., i : i + h, j : j + w] = True + + # The broadcasting and type casting logic is handled automagically in the kernel through indexing + value = torch.broadcast_to(v, (*image.shape[:-2], h, w)).to(image) + + erased_image = torch.empty_like(image) + erased_image[mask] = value.flatten() + erased_image[~mask] = image[~mask] + + return erased_image + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_image_correctness(self, dtype, device): + image = make_image(dtype=dtype, device=device) + + actual = F.erase(image, **self.FUNCTIONAL_KWARGS) + expected = self._reference_erase_image(image, **self.FUNCTIONAL_KWARGS) + + assert_equal(actual, expected) + + @param_value_parametrization( + scale=[(0.1, 0.2), [0.0, 1.0]], + ratio=[(0.3, 0.7), [0.1, 5.0]], + value=[0, 0.5, (0, 1, 0), [-0.2, 0.0, 1.3], "random"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, dtype, device, seed): + transform = transforms.RandomErasing(**{param: value}, p=1) + + image = make_image(dtype=dtype, device=device) + + with freeze_rng_state(): + torch.manual_seed(seed) + # This emulates the random apply check that happens before make_params is called + torch.rand(1) + params = transform.make_params([image]) + + torch.manual_seed(seed) + actual = transform(image) + + expected = self._reference_erase_image(image, **params) + + assert_equal(actual, expected) + + def test_transform_errors(self): + with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): + transforms.RandomErasing(value={}) + + with pytest.raises(ValueError, match="If value is str, it should be 'random'"): + transforms.RandomErasing(value="abc") + + with pytest.raises(TypeError, match="Scale should be a sequence"): + transforms.RandomErasing(scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence"): + transforms.RandomErasing(ratio=123) + + with pytest.raises(ValueError, match="Scale should be between 0 and 1"): + transforms.RandomErasing(scale=[-1, 2]) + + transform = transforms.RandomErasing(value=[1, 2, 3, 4]) + + with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): + transform.make_params([make_image()]) + 
+ +class TestGaussianBlur: + @pytest.mark.parametrize("kernel_size", [1, 3, (3, 1), [3, 5]]) + @pytest.mark.parametrize("sigma", [None, 1.0, 1, (0.5,), [0.3], (0.3, 0.7), [0.9, 0.2]]) + def test_kernel_image(self, kernel_size, sigma): + check_kernel( + F.gaussian_blur_image, + make_image(), + kernel_size=kernel_size, + sigma=sigma, + check_scripted_vs_eager=not (isinstance(kernel_size, int) or isinstance(sigma, (float, int))), + ) + + def test_kernel_image_errors(self): + image = make_image_tensor() + + with pytest.raises(ValueError, match="kernel_size is a sequence its length should be 2"): + F.gaussian_blur_image(image, kernel_size=[1, 2, 3]) + + for kernel_size in [2, -1]: + with pytest.raises(ValueError, match="kernel_size should have odd and positive integers"): + F.gaussian_blur_image(image, kernel_size=kernel_size) + + with pytest.raises(ValueError, match="sigma is a sequence, its length should be 2"): + F.gaussian_blur_image(image, kernel_size=1, sigma=[1, 2, 3]) + + with pytest.raises(TypeError, match="sigma should be either float or sequence of floats"): + F.gaussian_blur_image(image, kernel_size=1, sigma=object()) + + with pytest.raises(ValueError, match="sigma should have positive values"): + F.gaussian_blur_image(image, kernel_size=1, sigma=-1) + + def test_kernel_video(self): + check_kernel(F.gaussian_blur_video, make_video(), kernel_size=(3, 3)) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_functional(self, make_input): + check_functional(F.gaussian_blur, make_input(), kernel_size=(3, 3)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.gaussian_blur_image, torch.Tensor), + (F._misc._gaussian_blur_image_pil, PIL.Image.Image), + (F.gaussian_blur_image, tv_tensors.Image), + (F.gaussian_blur_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.gaussian_blur, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("sigma", [5, 2.0, (0.5, 2), [1.3, 2.7]]) + def test_transform(self, make_input, device, sigma): + check_transform(transforms.GaussianBlur(kernel_size=3, sigma=sigma), make_input(device=device)) + + def test_assertions(self): + with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): + transforms.GaussianBlur([10, 12, 14]) + + with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): + transforms.GaussianBlur(4) + + with pytest.raises(ValueError, match="If sigma is a sequence its length should be 1 or 2. 
Got 3"): + transforms.GaussianBlur(3, sigma=[1, 2, 3]) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=-1.0) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=[2.0, 1.0]) + + with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): + transforms.GaussianBlur(3, sigma={}) + + @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0], (10, 12.0), [10]]) + def test_make_params(self, sigma): + transform = transforms.GaussianBlur(3, sigma=sigma) + params = transform.make_params([]) + + if isinstance(sigma, float): + assert params["sigma"][0] == params["sigma"][1] == sigma + elif isinstance(sigma, list) and len(sigma) == 1: + assert params["sigma"][0] == params["sigma"][1] == sigma[0] + else: + assert sigma[0] <= params["sigma"][0] <= sigma[1] + assert sigma[0] <= params["sigma"][1] <= sigma[1] + + # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) + # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) + # { + # "10_12_3__3_3_0.8": cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8), + # "10_12_3__3_3_0.5": cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5), + # "10_12_3__3_5_0.8": cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8), + # "10_12_3__3_5_0.5": cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5), + # "26_28_1__23_23_1.7": cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7), + # } + REFERENCE_GAUSSIAN_BLUR_IMAGE_RESULTS = torch.load( + Path(__file__).parent / "assets" / "gaussian_blur_opencv_results.pt", + weights_only=False, + ) + + @pytest.mark.parametrize( + ("dimensions", "kernel_size", "sigma"), + [ + ((3, 10, 12), (3, 3), 0.8), + ((3, 10, 12), (3, 3), 0.5), + ((3, 10, 12), (3, 5), 0.8), + ((3, 10, 12), (3, 5), 0.5), + ((1, 26, 28), (23, 23), 1.7), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.float64, torch.float16]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_image_correctness(self, dimensions, kernel_size, sigma, dtype, device): + if dtype is torch.float16 and device == "cpu": + pytest.skip("The CPU implementation of float16 on CPU differs from opencv") + + num_channels, height, width = dimensions + + reference_results_key = f"{height}_{width}_{num_channels}__{kernel_size[0]}_{kernel_size[1]}_{sigma}" + expected = ( + torch.tensor(self.REFERENCE_GAUSSIAN_BLUR_IMAGE_RESULTS[reference_results_key]) + .reshape(height, width, num_channels) + .permute(2, 0, 1) + .to(dtype=dtype, device=device) + ) + + image = tv_tensors.Image( + torch.arange(num_channels * height * width, dtype=torch.uint8) + .reshape(height, width, num_channels) + .permute(2, 0, 1), + dtype=dtype, + device=device, + ) + + actual = F.gaussian_blur_image(image, kernel_size=kernel_size, sigma=sigma) + + torch.testing.assert_close(actual, expected, rtol=0, atol=1) + + +class TestGaussianNoise: + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_video], + ) + def test_kernel(self, make_input): + check_kernel( + F.gaussian_noise, + make_input(dtype=torch.float32), + # This cannot pass because the noise on a batch in not per-image + check_batched_vs_unbatched=False, + ) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_video], + ) + def test_functional(self, make_input): + check_functional(F.gaussian_noise, make_input(dtype=torch.float32)) + + @pytest.mark.parametrize( + ("kernel", 
"input_type"), + [ + (F.gaussian_noise, torch.Tensor), + (F.gaussian_noise_image, tv_tensors.Image), + (F.gaussian_noise_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.gaussian_noise, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_video], + ) + def test_transform(self, make_input): + def adapter(_, input, __): + # This transform doesn't support uint8 so we have to convert the auto-generated uint8 tensors to float32 + # Same for PIL images + for key, value in input.items(): + if isinstance(value, torch.Tensor) and not value.is_floating_point(): + input[key] = value.to(torch.float32) + if isinstance(value, PIL.Image.Image): + input[key] = F.pil_to_tensor(value).to(torch.float32) + return input + + check_transform(transforms.GaussianNoise(), make_input(dtype=torch.float32), check_sample_input=adapter) + + def test_bad_input(self): + with pytest.raises(ValueError, match="Gaussian Noise is not implemented for PIL images."): + F.gaussian_noise(make_image_pil()) + with pytest.raises(ValueError, match="Input tensor is expected to be in float dtype"): + F.gaussian_noise(make_image(dtype=torch.uint8)) + with pytest.raises(ValueError, match="sigma shouldn't be negative"): + F.gaussian_noise(make_image(dtype=torch.float32), sigma=-1) + + def test_clip(self): + img = make_image(dtype=torch.float32) + + out = F.gaussian_noise(img, mean=100, clip=False) + assert out.min() > 50 + + out = F.gaussian_noise(img, mean=100, clip=True) + assert (out == 1).all() + + out = F.gaussian_noise(img, mean=-100, clip=False) + assert out.min() < -50 + + out = F.gaussian_noise(img, mean=-100, clip=True) + assert (out == 0).all() + + +class TestAutoAugmentTransforms: + # These transforms have a lot of branches in their `forward()` passes which are conditioned on random sampling. + # It's typically very hard to test the effect on some parameters without heavy mocking logic. + # This class adds correctness tests for the kernels that are specific to those transforms. The rest of kernels, e.g. + # rotate, are tested in their respective classes. The rest of the tests here are mostly smoke tests. + + def _reference_shear_translate(self, image, *, transform_id, magnitude, interpolation, fill): + if isinstance(image, PIL.Image.Image): + input = image + else: + input = F.to_pil_image(image) + + matrix = { + "ShearX": (1, magnitude, 0, 0, 1, 0), + "ShearY": (1, 0, 0, magnitude, 1, 0), + "TranslateX": (1, 0, -int(magnitude), 0, 1, 0), + "TranslateY": (1, 0, 0, 0, 1, -int(magnitude)), + }[transform_id] + + output = input.transform( + input.size, PIL.Image.AFFINE, matrix, resample=pil_modes_mapping[interpolation], fill=fill + ) + + if isinstance(image, PIL.Image.Image): + return output + else: + return F.to_image(output) + + @pytest.mark.parametrize("transform_id", ["ShearX", "ShearY", "TranslateX", "TranslateY"]) + @pytest.mark.parametrize("magnitude", [0.3, -0.2, 0.0]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("input_type", ["Tensor", "PIL"]) + def test_correctness_shear_translate(self, transform_id, magnitude, interpolation, fill, input_type): + # ShearX/Y and TranslateX/Y are the only ops that are native to the AA transforms. 
They are modeled after the + # reference implementation: + # https://github.com/tensorflow/models/blob/885fda091c46c59d6c7bb5c7e760935eacc229da/research/autoaugment/augmentation_transforms.py#L273-L362 + # All other ops are checked in their respective dedicated tests. + + image = make_image(dtype=torch.uint8, device="cpu") + if input_type == "PIL": + image = F.to_pil_image(image) + + if "Translate" in transform_id: + # For TranslateX/Y magnitude is a value in pixels + magnitude *= min(F.get_size(image)) + + actual = transforms.AutoAugment()._apply_image_or_video_transform( + image, + transform_id=transform_id, + magnitude=magnitude, + interpolation=interpolation, + fill={type(image): fill}, + ) + expected = self._reference_shear_translate( + image, transform_id=transform_id, magnitude=magnitude, interpolation=interpolation, fill=fill + ) + + if input_type == "PIL": + actual, expected = F.to_image(actual), F.to_image(expected) + + if "Shear" in transform_id and input_type == "Tensor": + mae = (actual.float() - expected.float()).abs().mean() + assert mae < (12 if interpolation is transforms.InterpolationMode.NEAREST else 5) + else: + assert_close(actual, expected, rtol=0, atol=1) + + def _sample_input_adapter(self, transform, input, device): + adapted_input = {} + image_or_video_found = False + for key, value in input.items(): + if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.KeyPoints, tv_tensors.Mask)): + # AA transforms don't support bounding boxes or masks + continue + elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)): + if image_or_video_found: + # AA transforms only support a single image or video + continue + image_or_video_found = True + adapted_input[key] = value + return adapted_input + + @pytest.mark.parametrize( + "transform", + [transforms.AutoAugment(), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix()], + ) + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_smoke(self, transform, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + input = make_input(dtype=dtype, device=device) + + with freeze_rng_state(): + # By default every test starts from the same random seed. This leads to minimal coverage of the sampling + # that happens inside forward(). To avoid calling the transform multiple times to achieve higher coverage, + # we build a reproducible random seed from the input type, dtype, and device. + torch.manual_seed(hash((make_input, dtype, device))) + + # For v2, we changed the random sampling of the AA transforms. This makes it impossible to compare the v1 + # and v2 outputs without complicated mocking and monkeypatching. Thus, we skip the v1 compatibility checks + # here and only check if we can script the v2 transform and subsequently call the result. 
+ check_transform( + transform, input, check_v1_compatibility=False, check_sample_input=self._sample_input_adapter + ) + + if type(input) is torch.Tensor and dtype is torch.uint8: + _script(transform)(input) + + def test_auto_augment_policy_error(self): + with pytest.raises(ValueError, match="provided policy"): + transforms.AutoAugment(policy=None) + + @pytest.mark.parametrize("severity", [0, 11]) + def test_aug_mix_severity_error(self, severity): + with pytest.raises(ValueError, match="severity must be between"): + transforms.AugMix(severity=severity) + + @pytest.mark.parametrize("num_ops", [-1, 1.1]) + def test_rand_augment_num_ops_error(self, num_ops): + with pytest.raises( + ValueError, + match=re.escape(f"num_ops should be a non-negative integer, but got {num_ops} instead."), + ): + transforms.RandAugment(num_ops=num_ops) + + +class TestConvertBoundingBoxFormat: + old_new_formats = list( + itertools.permutations( + [f for f in tv_tensors.BoundingBoxFormat if not tv_tensors.is_rotated_bounding_format(f)], 2 + ) + ) + old_new_formats += list( + itertools.permutations([f for f in tv_tensors.BoundingBoxFormat if tv_tensors.is_rotated_bounding_format(f)], 2) + ) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_kernel(self, old_format, new_format): + check_kernel( + F.convert_bounding_box_format, + make_bounding_boxes(format=old_format), + new_format=new_format, + old_format=old_format, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("inplace", [False, True]) + def test_kernel_noop(self, format, inplace): + input = make_bounding_boxes(format=format).as_subclass(torch.Tensor) + input_version = input._version + + output = F.convert_bounding_box_format(input, old_format=format, new_format=format, inplace=inplace) + + assert output is input + assert output.data_ptr() == input.data_ptr() + assert output._version == input_version + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_kernel_inplace(self, old_format, new_format): + input = make_bounding_boxes(format=old_format).as_subclass(torch.Tensor) + input_version = input._version + + output_out_of_place = F.convert_bounding_box_format(input, old_format=old_format, new_format=new_format) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.convert_bounding_box_format( + input, old_format=old_format, new_format=new_format, inplace=True + ) + if old_format != tv_tensors.BoundingBoxFormat.XYXYXYXY and new_format != tv_tensors.BoundingBoxFormat.XYXYXYXY: + # NOTE: BoundingBox format conversion from and to XYXYXYXY format + # cannot modify the input tensor inplace as it requires a dimension + # change. 
+ assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_functional(self, old_format, new_format): + check_functional(F.convert_bounding_box_format, make_bounding_boxes(format=old_format), new_format=new_format) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + @pytest.mark.parametrize("format_type", ["enum", "str"]) + def test_transform(self, old_format, new_format, format_type): + check_transform( + transforms.ConvertBoundingBoxFormat(new_format.name if format_type == "str" else new_format), + make_bounding_boxes(format=old_format), + ) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_strings(self, old_format, new_format): + # Non-regression test for https://github.com/pytorch/vision/issues/8258 + input = make_bounding_boxes(format=old_format, canvas_size=(50, 50)) + expected = self._reference_convert_bounding_box_format(input, new_format) + + old_format = old_format.name + new_format = new_format.name + + out_functional = F.convert_bounding_box_format(input, new_format=new_format) + out_functional_tensor = F.convert_bounding_box_format( + input.as_subclass(torch.Tensor), old_format=old_format, new_format=new_format + ) + out_transform = transforms.ConvertBoundingBoxFormat(new_format)(input) + for out in (out_functional, out_functional_tensor, out_transform): + torch.testing.assert_close(out, expected) + + def _reference_convert_bounding_box_format(self, bounding_boxes, new_format): + return tv_tensors.wrap( + torchvision.ops.box_convert( + bounding_boxes.as_subclass(torch.Tensor), + in_fmt=bounding_boxes.format.name.lower(), + out_fmt=new_format.name.lower(), + ).to(bounding_boxes.dtype), + like=bounding_boxes, + format=new_format, + ) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn_type", ["functional", "transform"]) + def test_correctness(self, old_format, new_format, dtype, device, fn_type): + if not dtype.is_floating_point and ( + tv_tensors.is_rotated_bounding_format(old_format) or tv_tensors.is_rotated_bounding_format(new_format) + ): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=old_format, dtype=dtype, device=device) + + if fn_type == "functional": + fn = functools.partial(F.convert_bounding_box_format, new_format=new_format) + else: + fn = transforms.ConvertBoundingBoxFormat(format=new_format) + + actual = fn(bounding_boxes) + expected = self._reference_convert_bounding_box_format(bounding_boxes, new_format) + + torch.testing.assert_close(actual, expected) + + def test_errors(self): + input_tv_tensor = make_bounding_boxes() + input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor) + + for input in [input_tv_tensor, input_pure_tensor]: + with pytest.raises(TypeError, match="missing 1 required argument: 'new_format'"): + F.convert_bounding_box_format(input) + + with pytest.raises(ValueError, match="`old_format` has to be passed"): + F.convert_bounding_box_format(input_pure_tensor, new_format=input_tv_tensor.format) + + with pytest.raises(ValueError, match="`old_format` must not be passed"): + F.convert_bounding_box_format( + input_tv_tensor, 
old_format=input_tv_tensor.format, new_format=input_tv_tensor.format + ) + + +class TestResizedCrop: + INPUT_SIZE = (17, 11) + CROP_KWARGS = dict(top=2, left=2, height=5, width=7) + OUTPUT_SIZE = (19, 32) + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.resized_crop_image, make_image), + (F.resized_crop_bounding_boxes, make_bounding_boxes), + (F.resized_crop_mask, make_segmentation_mask), + (F.resized_crop_mask, make_detection_masks), + (F.resized_crop_video, make_video), + (F.resized_crop_keypoints, make_keypoints), + ], + ) + def test_kernel(self, kernel, make_input): + input = make_input(self.INPUT_SIZE) + if isinstance(input, tv_tensors.BoundingBoxes): + extra_kwargs = dict(format=input.format) + elif isinstance(input, (tv_tensors.Mask, tv_tensors.KeyPoints)): + extra_kwargs = dict() + else: + extra_kwargs = dict(antialias=True) + + check_kernel(kernel, input, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, **extra_kwargs) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional( + F.resized_crop, make_input(self.INPUT_SIZE), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, antialias=True + ) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.resized_crop_image, torch.Tensor), + (F._geometry._resized_crop_image_pil, PIL.Image.Image), + (F.resized_crop_image, tv_tensors.Image), + (F.resized_crop_mask, tv_tensors.Mask), + (F.resized_crop_video, tv_tensors.Video), + (F.resized_crop_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.resized_crop, kernel=kernel, input_type=input_type) + + @param_value_parametrization( + scale=[(0.1, 0.2), [0.0, 1.0]], + ratio=[(0.3, 0.7), [0.1, 5.0]], + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, param, value, make_input): + check_transform( + transforms.RandomResizedCrop(size=self.OUTPUT_SIZE, **{param: value}, antialias=True), + make_input(self.INPUT_SIZE), + check_v1_compatibility=dict(rtol=0, atol=1), + ) + + # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2. 
+ # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` + @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) + def test_functional_image_correctness(self, interpolation): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) + + actual = F.resized_crop( + image, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation, antialias=True + ) + expected = F.to_image( + F.resized_crop( + F.to_pil_image(image), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation + ) + ) + + torch.testing.assert_close(actual, expected, atol=1, rtol=0) + + def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width, size): + new_height, new_width = size + + crop_affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + [0, 0, 1], + ], + ) + resize_affine_matrix = np.array( + [ + [new_width / width, 0, 0], + [0, new_height / height, 0], + [0, 0, 1], + ], + ) + + affine_matrix = (resize_affine_matrix @ crop_affine_matrix)[:2, :] + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + + return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=size, clamp=False) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_functional_bounding_boxes_correctness(self, format): + # Note that we don't want to clamp because in + # _reference_resized_crop_bounding_boxes we are fusing the crop and the + # resize operation, where none of the croppings happen - particularly, + # the intermediate one. + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, clamping_mode=None) + + actual = F.resized_crop(bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE) + expected = self._reference_resized_crop_bounding_boxes( + bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE + ) + + torch.testing.assert_close(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def _reference_resized_crop_keypoints(self, keypoints, *, top, left, height, width, size): + new_height, new_width = size + + crop_affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + [0, 0, 1], + ], + ) + resize_affine_matrix = np.array( + [ + [new_width / width, 0, 0], + [0, new_height / height, 0], + [0, 0, 1], + ], + ) + intermediate_keypoints = reference_affine_keypoints_helper( + keypoints, + affine_matrix=crop_affine_matrix, + new_canvas_size=(height, width), + ) + return reference_affine_keypoints_helper( + intermediate_keypoints, + affine_matrix=resize_affine_matrix, + new_canvas_size=size, + ) + + def test_functional_keypoints_correctness(self): + keypoints = make_keypoints(self.INPUT_SIZE) + + actual = F.resized_crop(keypoints, **self.CROP_KWARGS, size=self.OUTPUT_SIZE) + expected = self._reference_resized_crop_keypoints(keypoints, **self.CROP_KWARGS, size=self.OUTPUT_SIZE) + + assert_equal(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def test_transform_errors_warnings(self): + with pytest.raises(ValueError, match="provide only two dimensions"): + transforms.RandomResizedCrop(size=(1, 2, 3)) + + with pytest.raises(TypeError, match="Scale should be a sequence of two floats."): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence of two floats."): + 
transforms.RandomResizedCrop(size=self.INPUT_SIZE, ratio=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence of two floats."): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, ratio=[1, 2, 3]) + + with pytest.raises(TypeError, match="Scale should be a sequence of two floats."): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, scale=[1, 2, 3]) + + for param in ["scale", "ratio"]: + with pytest.warns(match="Scale and ratio should be of kind"): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, **{param: [1, 0]}) + + +class TestPad: + EXHAUSTIVE_TYPE_PADDINGS = [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + CORRECTNESS_PADDINGS = [ + padding + for padding in EXHAUSTIVE_TYPE_PADDINGS + if isinstance(padding, int) or isinstance(padding, list) and len(padding) > 1 + ] + PADDING_MODES = ["constant", "symmetric", "edge", "reflect"] + + @param_value_parametrization( + padding=EXHAUSTIVE_TYPE_PADDINGS, + fill=EXHAUSTIVE_TYPE_FILLS, + padding_mode=PADDING_MODES, + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "fill": + value = adapt_fill(value, dtype=dtype) + kwargs = {param: value} + if param != "padding": + kwargs["padding"] = [1] + + image = make_image(dtype=dtype, device=device) + + check_kernel( + F.pad_image, + image, + **kwargs, + check_scripted_vs_eager=not ( + (param == "padding" and isinstance(value, int)) + # See https://github.com/pytorch/vision/pull/7252#issue-1585585521 for details + or ( + param == "fill" + and ( + isinstance(value, tuple) or (isinstance(value, list) and any(isinstance(v, int) for v in value)) + ) + ) + ), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, format): + bounding_boxes = make_bounding_boxes(format=format) + check_kernel( + F.pad_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + padding=[1], + ) + + @pytest.mark.parametrize("padding_mode", ["symmetric", "edge", "reflect"]) + def test_kernel_bounding_boxes_errors(self, padding_mode): + bounding_boxes = make_bounding_boxes() + with pytest.raises(ValueError, match=f"'{padding_mode}' is not supported"): + F.pad_bounding_boxes( + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + padding=[1], + padding_mode=padding_mode, + ) + + def test_kernel_keypoints(self): + keypoints = make_keypoints() + check_kernel( + F.pad_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + padding=[1], + ) + + @pytest.mark.parametrize("padding_mode", ["symmetric", "edge", "reflect"]) + def test_kernel_keypoints_errors(self, padding_mode): + keypoints = make_keypoints() + with pytest.raises(ValueError, match=f"'{padding_mode}' is not supported"): + F.pad_keypoints( + keypoints, + canvas_size=keypoints.canvas_size, + padding=[1], + padding_mode=padding_mode, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.pad_mask, make_mask(), padding=[1]) + + @pytest.mark.parametrize("fill", [[1], (0,), [1, 0, 1], (0, 1, 0)]) + def test_kernel_mask_errors(self, fill): + with pytest.raises(ValueError, match="Non-scalar fill value is not supported"): + F.pad_mask(make_segmentation_mask(), padding=[1], fill=fill) + + def test_kernel_video(self): + check_kernel(F.pad_video, make_video(), 
padding=[1]) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.pad, make_input(), padding=[1]) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.pad_image, torch.Tensor), + # The PIL kernel uses fill=0 as default rather than fill=None as all others. + # Since the whole fill story is already really inconsistent, we won't introduce yet another case to allow + # for this test to pass. + # See https://github.com/pytorch/vision/issues/6623 for a discussion. + # (F._geometry._pad_image_pil, PIL.Image.Image), + (F.pad_image, tv_tensors.Image), + (F.pad_bounding_boxes, tv_tensors.BoundingBoxes), + (F.pad_mask, tv_tensors.Mask), + (F.pad_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.pad, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, make_input): + check_transform(transforms.Pad(padding=[1]), make_input()) + + def test_transform_errors(self): + with pytest.raises(ValueError, match="Padding must be"): + transforms.Pad("abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): + transforms.Pad([-0.7, 0, 0.7]) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): + transforms.Pad(0.5) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): + transforms.Pad(padding=[0.5, 0.5]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.Pad(12, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.Pad(12, padding_mode="abc") + + @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) + @pytest.mark.parametrize( + ("padding_mode", "fill"), + [ + *[("constant", fill) for fill in CORRECTNESS_FILLS], + *[(padding_mode, None) for padding_mode in ["symmetric", "edge", "reflect"]], + ], + ) + @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) + def test_image_correctness(self, padding, padding_mode, fill, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = fn(image, padding=padding, padding_mode=padding_mode, fill=fill) + expected = F.to_image(F.pad(F.to_pil_image(image), padding=padding, padding_mode=padding_mode, fill=fill)) + + assert_equal(actual, expected) + + def _reference_pad_bounding_boxes(self, bounding_boxes, *, padding): + if isinstance(padding, int): + padding = [padding] + left, top, right, bottom = padding * (4 // len(padding)) + + affine_matrix = np.array( + [ + [1, 0, left], + [0, 1, top], + ], + ) + + height = bounding_boxes.canvas_size[0] + top + bottom + width = bounding_boxes.canvas_size[1] + left + right + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width)) + + @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) + 
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) + def test_bounding_boxes_correctness(self, padding, format, dtype, device, fn): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + actual = fn(bounding_boxes, padding=padding) + expected = self._reference_pad_bounding_boxes(bounding_boxes, padding=padding) + + torch.testing.assert_close(actual, expected) + + def _reference_pad_keypoints(self, keypoints, *, padding): + if isinstance(padding, int): + padding = [padding] + left, top, right, bottom = padding * (4 // len(padding)) + + affine_matrix = np.array( + [ + [1, 0, left], + [0, 1, top], + ], + ) + + height = keypoints.canvas_size[0] + top + bottom + width = keypoints.canvas_size[1] + left + right + + return reference_affine_keypoints_helper( + keypoints, affine_matrix=affine_matrix, new_canvas_size=(height, width) + ) + + @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) + def test_keypoints_correctness(self, padding, dtype, device, fn): + keypoints = make_keypoints(dtype=dtype, device=device) + + actual = fn(keypoints, padding=padding) + expected = self._reference_pad_keypoints(keypoints, padding=padding) + + assert_equal(actual, expected) + + +class TestCenterCrop: + INPUT_SIZE = (17, 11) + OUTPUT_SIZES = [(3, 5), (5, 3), (4, 4), (21, 9), (13, 15), (19, 14), 3, (4,), [5], INPUT_SIZE] + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, output_size, dtype, device): + check_kernel( + F.center_crop_image, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + output_size=output_size, + check_scripted_vs_eager=not isinstance(output_size, int), + ) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, output_size, format): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format) + check_kernel( + F.center_crop_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + output_size=output_size, + check_scripted_vs_eager=not isinstance(output_size, int), + ) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + def test_kernel_keypoints(self, output_size): + keypoints = make_keypoints(self.INPUT_SIZE) + check_kernel( + F.center_crop_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + output_size=output_size, + check_scripted_vs_eager=not isinstance(output_size, int), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.center_crop_mask, make_mask(), output_size=self.OUTPUT_SIZES[0]) + + def test_kernel_video(self): + check_kernel(F.center_crop_video, make_video(self.INPUT_SIZE), output_size=self.OUTPUT_SIZES[0]) + + 
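+ # Illustrative note: the reference helpers below model center crop as a pure
+ # translation by (-left, -top), with top = round((H - h) / 2) and
+ # left = round((W - w) / 2). For example, with INPUT_SIZE (17, 11) and an output
+ # size of (3, 5): top = round((17 - 3) / 2) = 7 and left = round((11 - 5) / 2) = 3,
+ # so a point at (x, y) lands at (x - 3, y - 7) on the new (3, 5) canvas.
+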
@pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.center_crop, make_input(self.INPUT_SIZE), output_size=self.OUTPUT_SIZES[0]) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.center_crop_image, torch.Tensor), + (F._geometry._center_crop_image_pil, PIL.Image.Image), + (F.center_crop_image, tv_tensors.Image), + (F.center_crop_bounding_boxes, tv_tensors.BoundingBoxes), + (F.center_crop_mask, tv_tensors.Mask), + (F.center_crop_video, tv_tensors.Video), + (F.center_crop_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.center_crop, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, make_input): + check_transform(transforms.CenterCrop(self.OUTPUT_SIZES[0]), make_input(self.INPUT_SIZE)) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) + def test_image_correctness(self, output_size, fn): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = fn(image, output_size) + expected = F.to_image(F.center_crop(F.to_pil_image(image), output_size=output_size)) + + assert_equal(actual, expected) + + def _reference_center_crop_bounding_boxes(self, bounding_boxes, output_size): + image_height, image_width = bounding_boxes.canvas_size + if isinstance(output_size, int): + output_size = (output_size, output_size) + elif len(output_size) == 1: + output_size *= 2 + crop_height, crop_width = output_size + + top = int(round((image_height - crop_height) / 2)) + left = int(round((image_width - crop_width) / 2)) + + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=output_size) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) + def test_bounding_boxes_correctness(self, output_size, format, dtype, device, fn): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + + actual = fn(bounding_boxes, output_size) + expected = self._reference_center_crop_bounding_boxes(bounding_boxes, output_size) + + torch.testing.assert_close(actual, expected) + + def _reference_center_crop_keypoints(self, keypoints, output_size): + image_height, image_width = keypoints.canvas_size + if isinstance(output_size, int): + output_size = (output_size, output_size) + elif len(output_size) == 1: + output_size *= 2 + crop_height, 
crop_width = output_size + + top = int(round((image_height - crop_height) / 2)) + left = int(round((image_width - crop_width) / 2)) + + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + return reference_affine_keypoints_helper(keypoints, affine_matrix=affine_matrix, new_canvas_size=output_size) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) + def test_keypoints_correctness(self, output_size, dtype, device, fn): + keypoints = make_keypoints(self.INPUT_SIZE, dtype=dtype, device=device) + + actual = fn(keypoints, output_size) + expected = self._reference_center_crop_keypoints(keypoints, output_size) + + assert_equal(actual, expected) + + +class TestPerspective: + COEFFICIENTS = [ + [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], + [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], + ] + START_END_POINTS = [ + ([[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]), + ([[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]), + ([[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]), + ] + MINIMAL_KWARGS = dict(startpoints=None, endpoints=None, coefficients=COEFFICIENTS[0]) + + @param_value_parametrization( + coefficients=COEFFICIENTS, + start_end_points=START_END_POINTS, + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "start_end_points": + kwargs = dict(zip(["startpoints", "endpoints"], value)) + else: + kwargs = {"startpoints": None, "endpoints": None, param: value} + if param == "fill": + kwargs["coefficients"] = self.COEFFICIENTS[0] + + check_kernel( + F.perspective_image, + make_image(dtype=dtype, device=device), + **kwargs, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + def test_kernel_image_error(self): + image = make_image_tensor() + + with pytest.raises(ValueError, match="startpoints/endpoints or the coefficients must have non `None` values"): + F.perspective_image(image, startpoints=None, endpoints=None) + + with pytest.raises( + ValueError, match="startpoints/endpoints and the coefficients shouldn't be defined concurrently" + ): + startpoints, endpoints = self.START_END_POINTS[0] + coefficients = self.COEFFICIENTS[0] + F.perspective_image(image, startpoints=startpoints, endpoints=endpoints, coefficients=coefficients) + + with pytest.raises(ValueError, match="coefficients should have 8 float values"): + F.perspective_image(image, startpoints=None, endpoints=None, coefficients=list(range(7))) + + @param_value_parametrization( + coefficients=COEFFICIENTS, + start_end_points=START_END_POINTS, + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, param, value, format): + if param == "start_end_points": + kwargs = dict(zip(["startpoints", "endpoints"], value)) + else: + kwargs = {"startpoints": None, "endpoints": None, param: value} + + bounding_boxes = make_bounding_boxes(format=format) + + check_kernel( + F.perspective_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + **kwargs, + ) + + def 
test_kernel_bounding_boxes_error(self): + bounding_boxes = make_bounding_boxes() + format, canvas_size = bounding_boxes.format, bounding_boxes.canvas_size + bounding_boxes = bounding_boxes.as_subclass(torch.Tensor) + + with pytest.raises(RuntimeError, match="Denominator is zero"): + F.perspective_bounding_boxes( + bounding_boxes, + format=format, + canvas_size=canvas_size, + startpoints=None, + endpoints=None, + coefficients=[0.0] * 8, + ) + + @param_value_parametrization( + coefficients=COEFFICIENTS, + start_end_points=START_END_POINTS, + ) + def test_kernel_keypoints(self, param, value): + if param == "start_end_points": + kwargs = dict(zip(["startpoints", "endpoints"], value)) + else: + kwargs = {"startpoints": None, "endpoints": None, param: value} + + keypoints = make_keypoints() + + check_kernel( + F.perspective_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + **kwargs, + ) + + def test_kernel_keypoints_error(self): + keypoints = make_keypoints() + canvas_size = keypoints.canvas_size + keypoints = keypoints.as_subclass(torch.Tensor) + + with pytest.raises(RuntimeError, match="Denominator is zero"): + F.perspective_keypoints( + keypoints, + canvas_size=canvas_size, + startpoints=None, + endpoints=None, + coefficients=[0.0] * 8, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.perspective_mask, make_mask(), **self.MINIMAL_KWARGS) + + def test_kernel_video(self): + check_kernel(F.perspective_video, make_video(), **self.MINIMAL_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.perspective, make_input(), **self.MINIMAL_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.perspective_image, torch.Tensor), + (F._geometry._perspective_image_pil, PIL.Image.Image), + (F.perspective_image, tv_tensors.Image), + (F.perspective_bounding_boxes, tv_tensors.BoundingBoxes), + (F.perspective_mask, tv_tensors.Mask), + (F.perspective_video, tv_tensors.Video), + (F.perspective_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.perspective, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("distortion_scale", [0.5, 0.0, 1.0]) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, distortion_scale, make_input): + check_transform(transforms.RandomPerspective(distortion_scale=distortion_scale, p=1), make_input()) + + @pytest.mark.parametrize("distortion_scale", [-1, 2]) + def test_transform_error(self, distortion_scale): + with pytest.raises(ValueError, match="distortion_scale value should be between 0 and 1"): + transforms.RandomPerspective(distortion_scale=distortion_scale) + + @pytest.mark.parametrize("coefficients", COEFFICIENTS) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_image_functional_correctness(self, coefficients, interpolation, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.perspective( + image, 
startpoints=None, endpoints=None, coefficients=coefficients, interpolation=interpolation, fill=fill + ) + expected = F.to_image( + F.perspective( + F.to_pil_image(image), + startpoints=None, + endpoints=None, + coefficients=coefficients, + interpolation=interpolation, + fill=fill, + ) + ) + + if interpolation is transforms.InterpolationMode.BILINEAR: + abs_diff = (actual.float() - expected.float()).abs() + assert (abs_diff > 1).float().mean() < 7e-2 + mae = abs_diff.mean() + assert mae < 3 + else: + assert_equal(actual, expected) + + def _reference_perspective_bounding_boxes(self, bounding_boxes, *, startpoints, endpoints): + format = bounding_boxes.format + canvas_size = bounding_boxes.canvas_size + clamping_mode = bounding_boxes.clamping_mode + dtype = bounding_boxes.dtype + device = bounding_boxes.device + is_rotated = tv_tensors.is_rotated_bounding_format(format) + ndims = 4 + if is_rotated and format == tv_tensors.BoundingBoxFormat.XYXYXYXY: + ndims = 8 + if is_rotated and format != tv_tensors.BoundingBoxFormat.XYXYXYXY: + ndims = 5 + + coefficients = _get_perspective_coeffs(endpoints, startpoints) + + def perspective_bounding_boxes(bounding_boxes): + m1 = np.array( + [ + [coefficients[0], coefficients[1], coefficients[2]], + [coefficients[3], coefficients[4], coefficients[5]], + ] + ) + m2 = np.array( + [ + [coefficients[6], coefficients[7], 1.0], + [coefficients[6], coefficients[7], 1.0], + ] + ) + + if is_rotated: + input_xyxyxyxy = F.convert_bounding_box_format( + bounding_boxes.to(device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, + inplace=True, + ) + x1, y1, x2, y2, x3, y3, x4, y4 = input_xyxyxyxy.squeeze(0).tolist() + points = np.array( + [ + [x1, y1, 1.0], + [x2, y2, 1.0], + [x3, y3, 1.0], + [x4, y4, 1.0], + ] + ) + + else: + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + input_xyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y1, 1.0], + [x1, y2, 1.0], + [x2, y2, 1.0], + ] + ) + + numerator = points @ m1.astype(points.dtype).T + denominator = points @ m2.astype(points.dtype).T + transformed_points = numerator / denominator + + if is_rotated: + output = torch.Tensor( + [ + float(transformed_points[0, 0]), + float(transformed_points[0, 1]), + float(transformed_points[1, 0]), + float(transformed_points[1, 1]), + float(transformed_points[2, 0]), + float(transformed_points[2, 1]), + float(transformed_points[3, 0]), + float(transformed_points[3, 1]), + ] + ) + output = _parallelogram_to_bounding_boxes(output) + else: + output = torch.Tensor( + [ + float(np.min(transformed_points[:, 0])), + float(np.min(transformed_points[:, 1])), + float(np.max(transformed_points[:, 0])), + float(np.max(transformed_points[:, 1])), + ] + ) + + output = F.convert_bounding_box_format( + output, + old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY, + new_format=format, + ) + + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + return F.clamp_bounding_boxes( + output, + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ).to(dtype=dtype, device=device) + + return tv_tensors.BoundingBoxes( + torch.cat( + [perspective_bounding_boxes(b) for b in 
bounding_boxes.reshape(-1, ndims).unbind()], dim=0 + ).reshape(bounding_boxes.shape), + format=format, + canvas_size=canvas_size, + ) + + @pytest.mark.parametrize(("startpoints", "endpoints"), START_END_POINTS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_correctness_perspective_bounding_boxes(self, startpoints, endpoints, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + actual = F.perspective(bounding_boxes, startpoints=startpoints, endpoints=endpoints) + expected = self._reference_perspective_bounding_boxes( + bounding_boxes, startpoints=startpoints, endpoints=endpoints + ) + + assert_close(actual, expected, rtol=0, atol=1) + + def _reference_perspective_keypoints(self, keypoints, *, startpoints, endpoints): + canvas_size = keypoints.canvas_size + dtype = keypoints.dtype + device = keypoints.device + + coefficients = _get_perspective_coeffs(endpoints, startpoints) + + def perspective_keypoints(keypoints): + m1 = np.array( + [ + [coefficients[0], coefficients[1], coefficients[2]], + [coefficients[3], coefficients[4], coefficients[5]], + ] + ) + m2 = np.array( + [ + [coefficients[6], coefficients[7], 1.0], + [coefficients[6], coefficients[7], 1.0], + ] + ) + + # Go to float before converting to prevent precision loss + x, y = keypoints.to(dtype=torch.float64, device="cpu", copy=True).squeeze(0).tolist() + + points = np.array([[x, y, 1.0]]) + + numerator = points @ m1.T + denominator = points @ m2.T + transformed_points = numerator / denominator + + output = torch.Tensor( + [ + float(transformed_points[0, 0]), + float(transformed_points[0, 1]), + ] + ) + + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + return F.clamp_keypoints( + output, + canvas_size=canvas_size, + ).to(dtype=dtype, device=device) + + return tv_tensors.KeyPoints( + torch.cat([perspective_keypoints(k) for k in keypoints.reshape(-1, 2).unbind()], dim=0).reshape( + keypoints.shape + ), + canvas_size=canvas_size, + ) + + @pytest.mark.parametrize(("startpoints", "endpoints"), START_END_POINTS) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_correctness_perspective_keypoints(self, startpoints, endpoints, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + + actual = F.perspective(keypoints, startpoints=startpoints, endpoints=endpoints) + expected = self._reference_perspective_keypoints(keypoints, startpoints=startpoints, endpoints=endpoints) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestEqualize: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.equalize_image, make_image(dtype=dtype, device=device)) + + def test_kernel_video(self): + check_kernel(F.equalize_image, make_video()) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.equalize, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.equalize_image, torch.Tensor), + 
(F._color._equalize_image_pil, PIL.Image.Image), + (F.equalize_image, tv_tensors.Image), + (F.equalize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.equalize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_transform(self, make_input): + check_transform(transforms.RandomEqualize(p=1), make_input()) + + @pytest.mark.parametrize(("low", "high"), [(0, 64), (64, 192), (192, 256), (0, 1), (127, 128), (255, 256)]) + @pytest.mark.parametrize("fn", [F.equalize, transform_cls_to_functional(transforms.RandomEqualize, p=1)]) + def test_image_correctness(self, low, high, fn): + # We are not using the default `make_image` here since that uniformly samples the values over the whole value + # range. Since the whole point of F.equalize is to transform an arbitrary distribution of values into a uniform + # one over the full range, the information gain is low if we already provide something really close to the + # expected value. + image = tv_tensors.Image( + torch.testing.make_tensor((3, 117, 253), dtype=torch.uint8, device="cpu", low=low, high=high) + ) + + actual = fn(image) + expected = F.to_image(F.equalize(F.to_pil_image(image))) + + assert_equal(actual, expected) + + +class TestUniformTemporalSubsample: + def test_kernel_video(self): + check_kernel(F.uniform_temporal_subsample_video, make_video(), num_samples=2) + + @pytest.mark.parametrize("make_input", [make_video_tensor, make_video]) + def test_functional(self, make_input): + check_functional(F.uniform_temporal_subsample, make_input(), num_samples=2) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.uniform_temporal_subsample_video, torch.Tensor), + (F.uniform_temporal_subsample_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.uniform_temporal_subsample, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_video_tensor, make_video]) + def test_transform(self, make_input): + check_transform(transforms.UniformTemporalSubsample(num_samples=2), make_input()) + + def _reference_uniform_temporal_subsample_video(self, video, *, num_samples): + # Adapted from + # https://github.com/facebookresearch/pytorchvideo/blob/c8d23d8b7e597586a9e2d18f6ed31ad8aa379a7a/pytorchvideo/transforms/functional.py#L19 + t = video.shape[-4] + assert num_samples > 0 and t > 0 + # Sample by nearest neighbor interpolation if num_samples > t. 
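+ # Worked example (illustrative, not an extra check): with t=5 frames,
+ # num_samples=3 gives linspace(0, 4, 3) = [0., 2., 4.], i.e. frames 0, 2 and 4
+ # are kept; with num_samples=7 > t, linspace(0, 4, 7).long() gives
+ # [0, 0, 1, 2, 2, 3, 4], i.e. frames are repeated (nearest-neighbor upsampling
+ # along the time dimension).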
+ indices = torch.linspace(0, t - 1, num_samples, device=video.device) + indices = torch.clamp(indices, 0, t - 1).long() + return tv_tensors.Video(torch.index_select(video, -4, indices)) + + CORRECTNESS_NUM_FRAMES = 5 + + @pytest.mark.parametrize("num_samples", list(range(1, CORRECTNESS_NUM_FRAMES + 1))) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize( + "fn", [F.uniform_temporal_subsample, transform_cls_to_functional(transforms.UniformTemporalSubsample)] + ) + def test_video_correctness(self, num_samples, dtype, device, fn): + video = make_video(num_frames=self.CORRECTNESS_NUM_FRAMES, dtype=dtype, device=device) + + actual = fn(video, num_samples=num_samples) + expected = self._reference_uniform_temporal_subsample_video(video, num_samples=num_samples) + + assert_equal(actual, expected) + + +class TestNormalize: + MEANS_STDS = [ + ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]), + ] + MEAN, STD = MEANS_STDS[0] + + @pytest.mark.parametrize(("mean", "std"), [*MEANS_STDS, (0.5, 2.0)]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, mean, std, device): + check_kernel(F.normalize_image, make_image(dtype=torch.float32, device=device), mean=self.MEAN, std=self.STD) + + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_inplace(self, device): + input = make_image_tensor(dtype=torch.float32, device=device) + input_version = input._version + + output_out_of_place = F.normalize_image(input, mean=self.MEAN, std=self.STD) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.normalize_image(input, mean=self.MEAN, std=self.STD, inplace=True) + assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + def test_kernel_video(self): + check_kernel(F.normalize_video, make_video(dtype=torch.float32), mean=self.MEAN, std=self.STD) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.normalize, make_input(dtype=torch.float32), mean=self.MEAN, std=self.STD) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.normalize_image, torch.Tensor), + (F.normalize_image, tv_tensors.Image), + (F.normalize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.normalize, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="should be a float tensor"): + F.normalize_image(make_image(dtype=torch.uint8), mean=self.MEAN, std=self.STD) + + with pytest.raises(ValueError, match="tensor image of size"): + F.normalize_image(torch.rand(16, 16, dtype=torch.float32), mean=self.MEAN, std=self.STD) + + for std in [0, [0, 0, 0], [0, 1, 1]]: + with pytest.raises(ValueError, match="std evaluated to zero, leading to division by zero"): + F.normalize_image(make_image(dtype=torch.float32), mean=self.MEAN, std=std) + + def _sample_input_adapter(self, transform, input, device): + adapted_input = {} + for key, value in input.items(): + if isinstance(value, PIL.Image.Image): + # normalize doesn't support PIL images + continue + elif check_type(value, (is_pure_tensor, tv_tensors.Image, tv_tensors.Video)): 
+ # normalize doesn't support integer images + value = F.to_dtype(value, torch.float32, scale=True) + adapted_input[key] = value + return adapted_input + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + def test_transform(self, make_input): + check_transform( + transforms.Normalize(mean=self.MEAN, std=self.STD), + make_input(dtype=torch.float32), + check_sample_input=self._sample_input_adapter, + ) + + def _reference_normalize_image(self, image, *, mean, std): + image = image.numpy() + mean, std = (np.array(stat, dtype=image.dtype).reshape((-1, 1, 1)) for stat in [mean, std]) + return tv_tensors.Image((image - mean) / std) + + @pytest.mark.parametrize(("mean", "std"), MEANS_STDS) + @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64]) + @pytest.mark.parametrize("fn", [F.normalize, transform_cls_to_functional(transforms.Normalize)]) + def test_correctness_image(self, mean, std, dtype, fn): + image = make_image(dtype=dtype) + + actual = fn(image, mean=mean, std=std) + expected = self._reference_normalize_image(image, mean=mean, std=std) + + assert_equal(actual, expected) + + +class TestClampBoundingBoxes: + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("clamping_mode", ("soft", "hard", None)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, format, clamping_mode, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, clamping_mode=clamping_mode, dtype=dtype, device=device) + check_kernel( + F.clamp_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + clamping_mode=clamping_mode, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("clamping_mode", ("soft", "hard", None)) + def test_functional(self, format, clamping_mode): + check_functional(F.clamp_bounding_boxes, make_bounding_boxes(format=format, clamping_mode=clamping_mode)) + + def test_errors(self): + input_tv_tensor = make_bounding_boxes() + input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor) + format, canvas_size = input_tv_tensor.format, input_tv_tensor.canvas_size + + for format_, canvas_size_, clamping_mode_ in itertools.product( + (format, None), (canvas_size, None), (input_tv_tensor.clamping_mode, "auto") + ): + with pytest.raises( + ValueError, + match="For pure tensor inputs, `format`, `canvas_size` and `clamping_mode` have to be passed.", + ): + F.clamp_bounding_boxes(input_pure_tensor, format=format_, canvas_size=canvas_size_) + + for format_, canvas_size_ in [(format, canvas_size), (format, None), (None, canvas_size)]: + with pytest.raises( + ValueError, match="For bounding box tv_tensor inputs, `format` and `canvas_size` must not be passed." 
+ ): + F.clamp_bounding_boxes(input_tv_tensor, format=format_, canvas_size=canvas_size_) + + with pytest.raises(ValueError, match="clamping_mode must be soft,"): + F.clamp_bounding_boxes(input_tv_tensor, clamping_mode="bad") + with pytest.raises(ValueError, match="clamping_mode must be soft,"): + transforms.ClampBoundingBoxes(clamping_mode="bad")(input_tv_tensor) + + def test_transform(self): + check_transform(transforms.ClampBoundingBoxes(), make_bounding_boxes()) + + @pytest.mark.parametrize("rotated", (True, False)) + @pytest.mark.parametrize("constructor_clamping_mode", ("soft", "hard", None)) + @pytest.mark.parametrize("clamping_mode", ("soft", "hard", None, "auto")) + @pytest.mark.parametrize("pass_pure_tensor", (True, False)) + @pytest.mark.parametrize("fn", [F.clamp_bounding_boxes, transform_cls_to_functional(transforms.ClampBoundingBoxes)]) + def test_clamping_mode(self, rotated, constructor_clamping_mode, clamping_mode, pass_pure_tensor, fn): + # This test checks 2 things: + # - That passing clamping_mode=None to the clamp_bounding_boxes + # functional (or to the class) relies on the box's `.clamping_mode` + # attribute + # - That clamping happens when it should, and only when it should, i.e. + # when the clamping mode is not None. It doesn't validate the + # numerical results, only that clamping happened. For that, we create + # a large 100x100 box inside of a small 10x10 image. + + if pass_pure_tensor and fn is not F.clamp_bounding_boxes: + # Only the functional supports pure tensors, not the class + return + if pass_pure_tensor and clamping_mode == "auto": + # cannot leave clamping_mode="auto" when passing pure tensor + return + + if rotated: + boxes = tv_tensors.BoundingBoxes( + [0.0, 0.0, 100.0, 100.0, 0.0], + format="XYWHR", + canvas_size=(10, 10), + clamping_mode=constructor_clamping_mode, + ) + expected_clamped_output = torch.tensor([[0.0, 0.0, 10.0, 10.0, 0.0]]) + else: + boxes = tv_tensors.BoundingBoxes( + [0, 100, 0, 100], format="XYXY", canvas_size=(10, 10), clamping_mode=constructor_clamping_mode + ) + expected_clamped_output = torch.tensor([[0, 10, 0, 10]]) + + if pass_pure_tensor: + out = fn( + boxes.as_subclass(torch.Tensor), + format=boxes.format, + canvas_size=boxes.canvas_size, + clamping_mode=clamping_mode, + ) + else: + out = fn(boxes, clamping_mode=clamping_mode) + + clamping_mode_prevailing = constructor_clamping_mode if clamping_mode == "auto" else clamping_mode + if clamping_mode_prevailing is None: + assert_equal(boxes, out) # should be a pass-through + else: + assert_equal(out, expected_clamped_output) + + +class TestSetClampingMode: + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("constructor_clamping_mode", ("soft", "hard", None)) + @pytest.mark.parametrize("desired_clamping_mode", ("soft", "hard", None)) + def test_setter(self, format, constructor_clamping_mode, desired_clamping_mode): + + in_boxes = make_bounding_boxes(format=format, clamping_mode=constructor_clamping_mode) + out_boxes = transforms.SetClampingMode(clamping_mode=desired_clamping_mode)(in_boxes) + + assert in_boxes.clamping_mode == constructor_clamping_mode # input is unchanged: no leak + assert out_boxes.clamping_mode == desired_clamping_mode + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("constructor_clamping_mode", ("soft", "hard", None)) + def test_pipeline_no_leak(self, format, constructor_clamping_mode): + class AssertClampingMode(transforms.Transform): + def __init__(self, 
expected_clamping_mode): + super().__init__() + self.expected_clamping_mode = expected_clamping_mode + + _transformed_types = (tv_tensors.BoundingBoxes,) + + def transform(self, inpt, _): + assert inpt.clamping_mode == self.expected_clamping_mode + return inpt + + t = transforms.Compose( + [ + transforms.SetClampingMode(None), + AssertClampingMode(None), + transforms.SetClampingMode("hard"), + AssertClampingMode("hard"), + transforms.SetClampingMode(None), + AssertClampingMode(None), + transforms.ClampBoundingBoxes("hard"), + ] + ) + + in_boxes = make_bounding_boxes(format=format, clamping_mode=constructor_clamping_mode) + out_boxes = t(in_boxes) + + assert in_boxes.clamping_mode == constructor_clamping_mode # input is unchanged: no leak + + # assert that the output boxes clamping_mode is the one set by the last SetClampingMode. + # ClampBoundingBoxes doesn't set clamping_mode. + assert out_boxes.clamping_mode is None + + def test_error(self): + with pytest.raises(ValueError, match="clamping_mode must be"): + transforms.SetClampingMode("bad") + + +class TestClampKeyPoints: + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + check_kernel( + F.clamp_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + ) + + def test_functional(self): + check_functional(F.clamp_keypoints, make_keypoints()) + + def test_errors(self): + input_tv_tensor = make_keypoints() + input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor) + + with pytest.raises(ValueError, match="`canvas_size` has to be passed"): + F.clamp_keypoints(input_pure_tensor, canvas_size=None) + + with pytest.raises(ValueError, match="`canvas_size` must not be passed"): + F.clamp_keypoints(input_tv_tensor, canvas_size=input_tv_tensor.canvas_size) + + def test_transform(self): + check_transform(transforms.ClampKeyPoints(), make_keypoints()) + + +class TestInvert: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.int16, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.invert_image, make_image(dtype=dtype, device=device)) + + def test_kernel_video(self): + check_kernel(F.invert_video, make_video()) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.invert, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.invert_image, torch.Tensor), + (F._color._invert_image_pil, PIL.Image.Image), + (F.invert_image, tv_tensors.Image), + (F.invert_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.invert, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomInvert(p=1), make_input()) + + @pytest.mark.parametrize("fn", [F.invert, transform_cls_to_functional(transforms.RandomInvert, p=1)]) + def test_correctness_image(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.invert(F.to_pil_image(image))) + + assert_equal(actual, expected) + + +class TestPosterize: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + 
@pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.posterize_image, make_image(dtype=dtype, device=device), bits=1) + + def test_kernel_video(self): + check_kernel(F.posterize_video, make_video(), bits=1) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.posterize, make_input(), bits=1) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.posterize_image, torch.Tensor), + (F._color._posterize_image_pil, PIL.Image.Image), + (F.posterize_image, tv_tensors.Image), + (F.posterize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.posterize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomPosterize(bits=1, p=1), make_input()) + + @pytest.mark.parametrize("bits", [1, 4, 8]) + @pytest.mark.parametrize("fn", [F.posterize, transform_cls_to_functional(transforms.RandomPosterize, p=1)]) + def test_correctness_image(self, bits, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image, bits=bits) + expected = F.to_image(F.posterize(F.to_pil_image(image), bits=bits)) + + assert_equal(actual, expected) + + @pytest.mark.parametrize("bits", [-1, 9, 2.1]) + def test_error_functional(self, bits): + with pytest.raises( + TypeError, + match=re.escape(f"bits must be a positive integer in the range [0, 8], got {bits} instead."), + ): + F.posterize(make_image(dtype=torch.uint8), bits=bits) + + +class TestSolarize: + def _make_threshold(self, input, *, factor=0.5): + dtype = input.dtype if isinstance(input, torch.Tensor) else torch.uint8 + return (float if dtype.is_floating_point else int)(get_max_value(dtype) * factor) + + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + image = make_image(dtype=dtype, device=device) + check_kernel(F.solarize_image, image, threshold=self._make_threshold(image)) + + def test_kernel_video(self): + video = make_video() + check_kernel(F.solarize_video, video, threshold=self._make_threshold(video)) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + input = make_input() + check_functional(F.solarize, input, threshold=self._make_threshold(input)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.solarize_image, torch.Tensor), + (F._color._solarize_image_pil, PIL.Image.Image), + (F.solarize_image, tv_tensors.Image), + (F.solarize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.solarize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize(("dtype", "threshold"), [(torch.uint8, 256), (torch.float, 1.5)]) + def test_functional_error(self, dtype, threshold): + with pytest.raises(TypeError, match="Threshold should be less or equal the maximum value of the dtype"): + F.solarize(make_image(dtype=dtype), threshold=threshold) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + input = make_input() + 
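+ # _make_threshold above scales the maximum value of the input's dtype, e.g. with
+ # the default factor of 0.5 the threshold is int(255 * 0.5) == 127 for uint8
+ # inputs and 0.5 for float32 inputs.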
check_transform(transforms.RandomSolarize(threshold=self._make_threshold(input), p=1), input) + + @pytest.mark.parametrize("threshold_factor", [0.0, 0.1, 0.5, 0.9, 1.0]) + @pytest.mark.parametrize("fn", [F.solarize, transform_cls_to_functional(transforms.RandomSolarize, p=1)]) + def test_correctness_image(self, threshold_factor, fn): + image = make_image(dtype=torch.uint8, device="cpu") + threshold = self._make_threshold(image, factor=threshold_factor) + + actual = fn(image, threshold=threshold) + expected = F.to_image(F.solarize(F.to_pil_image(image), threshold=threshold)) + + assert_equal(actual, expected) + + +class TestAutocontrast: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.int16, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.autocontrast_image, make_image(dtype=dtype, device=device)) + + def test_kernel_video(self): + check_kernel(F.autocontrast_video, make_video()) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.autocontrast, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.autocontrast_image, torch.Tensor), + (F._color._autocontrast_image_pil, PIL.Image.Image), + (F.autocontrast_image, tv_tensors.Image), + (F.autocontrast_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.autocontrast, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomAutocontrast(p=1), make_input(), check_v1_compatibility=dict(rtol=0, atol=1)) + + @pytest.mark.parametrize("fn", [F.autocontrast, transform_cls_to_functional(transforms.RandomAutocontrast, p=1)]) + def test_correctness_image(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.autocontrast(F.to_pil_image(image))) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestAdjustSharpness: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_sharpness_image, make_image(dtype=dtype, device=device), sharpness_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_sharpness_video, make_video(), sharpness_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_sharpness, make_input(), sharpness_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_sharpness_image, torch.Tensor), + (F._color._adjust_sharpness_image_pil, PIL.Image.Image), + (F.adjust_sharpness_image, tv_tensors.Image), + (F.adjust_sharpness_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_sharpness, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1), make_input()) + + def test_functional_error(self): + with 
pytest.raises(TypeError, match="can have 1 or 3 channels"): + F.adjust_sharpness(make_image(color_space="RGBA"), sharpness_factor=0.5) + + with pytest.raises(ValueError, match="is not non-negative"): + F.adjust_sharpness(make_image(), sharpness_factor=-1) + + @pytest.mark.parametrize("sharpness_factor", [0.1, 0.5, 1.0]) + @pytest.mark.parametrize( + "fn", [F.adjust_sharpness, transform_cls_to_functional(transforms.RandomAdjustSharpness, p=1)] + ) + def test_correctness_image(self, sharpness_factor, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image, sharpness_factor=sharpness_factor) + expected = F.to_image(F.adjust_sharpness(F.to_pil_image(image), sharpness_factor=sharpness_factor)) + + assert_equal(actual, expected) + + +class TestAdjustContrast: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_contrast_image, make_image(dtype=dtype, device=device), contrast_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_contrast_video, make_video(), contrast_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_contrast, make_input(), contrast_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_contrast_image, torch.Tensor), + (F._color._adjust_contrast_image_pil, PIL.Image.Image), + (F.adjust_contrast_image, tv_tensors.Image), + (F.adjust_contrast_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_contrast, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + F.adjust_contrast(make_image(color_space="RGBA"), contrast_factor=0.5) + + with pytest.raises(ValueError, match="is not non-negative"): + F.adjust_contrast(make_image(), contrast_factor=-1) + + @pytest.mark.parametrize("contrast_factor", [0.1, 0.5, 1.0]) + def test_correctness_image(self, contrast_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_contrast(image, contrast_factor=contrast_factor) + expected = F.to_image(F.adjust_contrast(F.to_pil_image(image), contrast_factor=contrast_factor)) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestAdjustGamma: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_gamma_image, make_image(dtype=dtype, device=device), gamma=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_gamma_video, make_video(), gamma=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_gamma, make_input(), gamma=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_gamma_image, torch.Tensor), + (F._color._adjust_gamma_image_pil, PIL.Image.Image), + (F.adjust_gamma_image, tv_tensors.Image), + (F.adjust_gamma_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_gamma, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with 
pytest.raises(ValueError, match="Gamma should be a non-negative real number"): + F.adjust_gamma(make_image(), gamma=-1) + + @pytest.mark.parametrize("gamma", [0.1, 0.5, 1.0]) + @pytest.mark.parametrize("gain", [0.1, 1.0, 2.0]) + def test_correctness_image(self, gamma, gain): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_gamma(image, gamma=gamma, gain=gain) + expected = F.to_image(F.adjust_gamma(F.to_pil_image(image), gamma=gamma, gain=gain)) + + assert_equal(actual, expected) + + +class TestAdjustHue: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_hue_image, make_image(dtype=dtype, device=device), hue_factor=0.25) + + def test_kernel_video(self): + check_kernel(F.adjust_hue_video, make_video(), hue_factor=0.25) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_hue, make_input(), hue_factor=0.25) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_hue_image, torch.Tensor), + (F._color._adjust_hue_image_pil, PIL.Image.Image), + (F.adjust_hue_image, tv_tensors.Image), + (F.adjust_hue_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_hue, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + F.adjust_hue(make_image(color_space="RGBA"), hue_factor=0.25) + + for hue_factor in [-1, 1]: + with pytest.raises(ValueError, match=re.escape("is not in [-0.5, 0.5]")): + F.adjust_hue(make_image(), hue_factor=hue_factor) + + @pytest.mark.parametrize("hue_factor", [-0.5, -0.3, 0.0, 0.2, 0.5]) + def test_correctness_image(self, hue_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_hue(image, hue_factor=hue_factor) + expected = F.to_image(F.adjust_hue(F.to_pil_image(image), hue_factor=hue_factor)) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 + + +class TestAdjustSaturation: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_saturation_image, make_image(dtype=dtype, device=device), saturation_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_saturation_video, make_video(), saturation_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_saturation, make_input(), saturation_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_saturation_image, torch.Tensor), + (F._color._adjust_saturation_image_pil, PIL.Image.Image), + (F.adjust_saturation_image, tv_tensors.Image), + (F.adjust_saturation_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_saturation, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + F.adjust_saturation(make_image(color_space="RGBA"), saturation_factor=0.5) + + with pytest.raises(ValueError, match="is not 
non-negative"): + F.adjust_saturation(make_image(), saturation_factor=-1) + + @pytest.mark.parametrize("saturation_factor", [0.1, 0.5, 1.0]) + def test_correctness_image(self, saturation_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_saturation(image, saturation_factor=saturation_factor) + expected = F.to_image(F.adjust_saturation(F.to_pil_image(image), saturation_factor=saturation_factor)) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestFiveTenCrop: + INPUT_SIZE = (17, 11) + OUTPUT_SIZE = (3, 5) + + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("kernel", [F.five_crop_image, F.ten_crop_image]) + def test_kernel_image(self, dtype, device, kernel): + check_kernel( + kernel, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + size=self.OUTPUT_SIZE, + check_batched_vs_unbatched=False, + ) + + @pytest.mark.parametrize("kernel", [F.five_crop_video, F.ten_crop_video]) + def test_kernel_video(self, kernel): + check_kernel(kernel, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZE, check_batched_vs_unbatched=False) + + def _functional_wrapper(self, fn): + # This wrapper is needed to make five_crop / ten_crop compatible with check_functional, since that requires a + # single output rather than a sequence. + @functools.wraps(fn) + def wrapper(*args, **kwargs): + outputs = fn(*args, **kwargs) + return outputs[0] + + return wrapper + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("functional", [F.five_crop, F.ten_crop]) + def test_functional(self, make_input, functional): + check_functional( + self._functional_wrapper(functional), + make_input(self.INPUT_SIZE), + size=self.OUTPUT_SIZE, + check_scripted_smoke=False, + ) + + @pytest.mark.parametrize( + ("functional", "kernel", "input_type"), + [ + (F.five_crop, F.five_crop_image, torch.Tensor), + (F.five_crop, F._geometry._five_crop_image_pil, PIL.Image.Image), + (F.five_crop, F.five_crop_image, tv_tensors.Image), + (F.five_crop, F.five_crop_video, tv_tensors.Video), + (F.ten_crop, F.ten_crop_image, torch.Tensor), + (F.ten_crop, F._geometry._ten_crop_image_pil, PIL.Image.Image), + (F.ten_crop, F.ten_crop_image, tv_tensors.Image), + (F.ten_crop, F.ten_crop_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, functional, kernel, input_type): + check_functional_kernel_signature_match(functional, kernel=kernel, input_type=input_type) + + class _TransformWrapper(nn.Module): + # This wrapper is needed to make FiveCrop / TenCrop compatible with check_transform, since that requires a + # single output rather than a sequence. 
+ _v1_transform_cls = None + + def _extract_params_for_v1_transform(self): + return dict(five_ten_crop_transform=self.five_ten_crop_transform) + + def __init__(self, five_ten_crop_transform): + super().__init__() + type(self)._v1_transform_cls = type(self) + self.five_ten_crop_transform = five_ten_crop_transform + + def forward(self, input: torch.Tensor) -> torch.Tensor: + outputs = self.five_ten_crop_transform(input) + return outputs[0] + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop]) + def test_transform(self, make_input, transform_cls): + check_transform( + self._TransformWrapper(transform_cls(size=self.OUTPUT_SIZE)), + make_input(self.INPUT_SIZE), + check_sample_input=False, + ) + + @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_masks]) + @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop]) + def test_transform_error(self, make_input, transform_cls): + transform = transform_cls(size=self.OUTPUT_SIZE) + + with pytest.raises(TypeError, match="not supported"): + transform(make_input(self.INPUT_SIZE)) + + @pytest.mark.parametrize("fn", [F.five_crop, transform_cls_to_functional(transforms.FiveCrop)]) + def test_correctness_image_five_crop(self, fn): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = fn(image, size=self.OUTPUT_SIZE) + expected = F.five_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE) + + assert isinstance(actual, tuple) + assert_equal(actual, [F.to_image(e) for e in expected]) + + @pytest.mark.parametrize("fn_or_class", [F.ten_crop, transforms.TenCrop]) + @pytest.mark.parametrize("vertical_flip", [False, True]) + def test_correctness_image_ten_crop(self, fn_or_class, vertical_flip): + if fn_or_class is transforms.TenCrop: + fn = transform_cls_to_functional(fn_or_class, size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + kwargs = dict() + else: + fn = fn_or_class + kwargs = dict(size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = fn(image, **kwargs) + expected = F.ten_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + + assert isinstance(actual, tuple) + assert_equal(actual, [F.to_image(e) for e in expected]) + + +class TestColorJitter: + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + + # TODO needed to add seed after KeyPoints PR, not sure why? failure + # wasn't really significant anyway. 
+ torch.manual_seed(1) + check_transform( + transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25), + make_input(dtype=dtype, device=device), + ) + + def test_transform_noop(self): + input = make_image() + input_version = input._version + + transform = transforms.ColorJitter() + output = transform(input) + + assert output is input + assert output.data_ptr() == input.data_ptr() + assert output._version == input_version + + def test_transform_error(self): + with pytest.raises(ValueError, match="must be non negative"): + transforms.ColorJitter(brightness=-1) + + for brightness in [object(), [1, 2, 3]]: + with pytest.raises(TypeError, match="single number or a sequence with length 2"): + transforms.ColorJitter(brightness=brightness) + + with pytest.raises(ValueError, match="values should be between"): + transforms.ColorJitter(brightness=(-1, 0.5)) + + with pytest.raises(ValueError, match="values should be between"): + transforms.ColorJitter(hue=1) + + @pytest.mark.parametrize("brightness", [None, 0.1, (0.2, 0.3)]) + @pytest.mark.parametrize("contrast", [None, 0.4, (0.5, 0.6)]) + @pytest.mark.parametrize("saturation", [None, 0.7, (0.8, 0.9)]) + @pytest.mark.parametrize("hue", [None, 0.3, (-0.1, 0.2)]) + def test_transform_correctness(self, brightness, contrast, saturation, hue): + image = make_image(dtype=torch.uint8, device="cpu") + + transform = transforms.ColorJitter(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) + + with freeze_rng_state(): + torch.manual_seed(0) + actual = transform(image) + + torch.manual_seed(0) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 + + +class TestRgbToGrayscale: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.rgb_to_grayscale_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_functional(self, make_input): + check_functional(F.rgb_to_grayscale, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rgb_to_grayscale_image, torch.Tensor), + (F._color._rgb_to_grayscale_image_pil, PIL.Image.Image), + (F.rgb_to_grayscale_image, tv_tensors.Image), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.rgb_to_grayscale, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("transform", [transforms.Grayscale(), transforms.RandomGrayscale(p=1)]) + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_transform(self, transform, make_input): + check_transform(transform, make_input()) + + @pytest.mark.parametrize("num_output_channels", [1, 3]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("fn", [F.rgb_to_grayscale, transform_cls_to_functional(transforms.Grayscale)]) + def test_image_correctness(self, num_output_channels, color_space, fn): + image = make_image(dtype=torch.uint8, device="cpu", color_space=color_space) + + actual = fn(image, num_output_channels=num_output_channels) + expected = F.to_image(F.rgb_to_grayscale(F.to_pil_image(image), num_output_channels=num_output_channels)) + + assert_equal(actual, expected, rtol=0, atol=1) + + def test_expanded_channels_are_not_views_into_the_same_underlying_tensor(self): + image = 
make_image(dtype=torch.uint8, device="cpu", color_space="GRAY") + + output_image = F.rgb_to_grayscale(image, num_output_channels=3) + assert_equal(output_image[0][0][0], output_image[1][0][0]) + output_image[0][0][0] = output_image[0][0][0] + 1 + assert output_image[0][0][0] != output_image[1][0][0] + + @pytest.mark.parametrize("num_input_channels", [1, 3]) + def test_random_transform_correctness(self, num_input_channels): + image = make_image( + color_space={ + 1: "GRAY", + 3: "RGB", + }[num_input_channels], + dtype=torch.uint8, + device="cpu", + ) + + transform = transforms.RandomGrayscale(p=1) + + actual = transform(image) + expected = F.to_image(F.rgb_to_grayscale(F.to_pil_image(image), num_output_channels=num_input_channels)) + + assert_equal(actual, expected, rtol=0, atol=1) + + +class TestGrayscaleToRgb: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.grayscale_to_rgb_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_functional(self, make_input): + check_functional(F.grayscale_to_rgb, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rgb_to_grayscale_image, torch.Tensor), + (F._color._rgb_to_grayscale_image_pil, PIL.Image.Image), + (F.rgb_to_grayscale_image, tv_tensors.Image), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.grayscale_to_rgb, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_transform(self, make_input): + check_transform(transforms.RGB(), make_input(color_space="GRAY")) + + @pytest.mark.parametrize("fn", [F.grayscale_to_rgb, transform_cls_to_functional(transforms.RGB)]) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu", color_space="GRAY") + + actual = fn(image) + expected = F.to_image(F.grayscale_to_rgb(F.to_pil_image(image))) + + assert_equal(actual, expected, rtol=0, atol=1) + + def test_expanded_channels_are_not_views_into_the_same_underlying_tensor(self): + image = make_image(dtype=torch.uint8, device="cpu", color_space="GRAY") + + output_image = F.grayscale_to_rgb(image) + assert_equal(output_image[0][0][0], output_image[1][0][0]) + output_image[0][0][0] = output_image[0][0][0] + 1 + assert output_image[0][0][0] != output_image[1][0][0] + + def test_rgb_image_is_unchanged(self): + image = make_image(dtype=torch.uint8, device="cpu", color_space="RGB") + assert_equal(image.shape[-3], 3) + assert_equal(F.grayscale_to_rgb(image), image) + + +class TestRandomZoomOut: + # Tests are light because this largely relies on the already tested `pad` kernels. 
+ + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + def test_transform(self, make_input): + check_transform(transforms.RandomZoomOut(p=1), make_input()) + + def test_transform_error(self): + for side_range in [None, 1, [1, 2, 3]]: + with pytest.raises( + ValueError if isinstance(side_range, list) else TypeError, match="should be a sequence of length 2" + ): + transforms.RandomZoomOut(side_range=side_range) + + for side_range in [[0.5, 1.5], [2.0, 1.0]]: + with pytest.raises(ValueError, match="Invalid side range"): + transforms.RandomZoomOut(side_range=side_range) + + @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_params_correctness(self, side_range, make_input, device): + if make_input is make_image_pil and device != "cpu": + pytest.skip("PIL image tests with parametrization device!='cpu' will degenerate to that anyway.") + + transform = transforms.RandomZoomOut(side_range=side_range) + + input = make_input() + height, width = F.get_size(input) + + params = transform.make_params([input]) + assert "padding" in params + + padding = params["padding"] + assert len(padding) == 4 + + assert 0 <= padding[0] <= (side_range[1] - 1) * width + assert 0 <= padding[1] <= (side_range[1] - 1) * height + assert 0 <= padding[2] <= (side_range[1] - 1) * width + assert 0 <= padding[3] <= (side_range[1] - 1) * height + + +class TestRandomPhotometricDistort: + # Tests are light because this largely relies on the already tested + # `adjust_{brightness,contrast,saturation,hue}` and `permute_channels` kernels. + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + + check_transform( + transforms.RandomPhotometricDistort( + brightness=(0.3, 0.4), contrast=(0.5, 0.6), saturation=(0.7, 0.8), hue=(-0.1, 0.2), p=1 + ), + make_input(dtype=dtype, device=device), + ) + + +class TestScaleJitter: + # Tests are light because this largely relies on the already tested `resize` kernels. 
+ + INPUT_SIZE = (17, 11) + TARGET_SIZE = (12, 13) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + if make_input is make_image_pil and device != "cpu": + pytest.skip("PIL image tests with parametrization device!='cpu' will degenerate to that anyway.") + + check_transform(transforms.ScaleJitter(self.TARGET_SIZE), make_input(self.INPUT_SIZE, device=device)) + + def test_make_params(self): + input_size = self.INPUT_SIZE + target_size = self.TARGET_SIZE + scale_range = (0.5, 1.5) + + transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) + params = transform.make_params([make_image(input_size)]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + height, width = size + + r_min = min(target_size[1] / input_size[0], target_size[0] / input_size[1]) * scale_range[0] + r_max = min(target_size[1] / input_size[0], target_size[0] / input_size[1]) * scale_range[1] + + assert int(input_size[0] * r_min) <= height <= int(input_size[0] * r_max) + assert int(input_size[1] * r_min) <= width <= int(input_size[1] * r_max) + + +class TestLinearTransform: + def _make_matrix_and_vector(self, input, *, device=None): + device = device or input.device + numel = math.prod(F.get_dimensions(input)) + transformation_matrix = torch.randn((numel, numel), device=device) + mean_vector = torch.randn((numel,), device=device) + return transformation_matrix, mean_vector + + def _sample_input_adapter(self, transform, input, device): + return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)} + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + input = make_input(dtype=dtype, device=device) + check_transform( + transforms.LinearTransformation(*self._make_matrix_and_vector(input)), + input, + check_sample_input=self._sample_input_adapter, + # Compat check is failing on M1 with: + # AssertionError: Tensor-likes are not close! 
+ # Mismatched elements: 1 / 561 (0.2%) + # See https://github.com/pytorch/vision/issues/8453 + check_v1_compatibility=(sys.platform != "darwin"), + ) + + def test_transform_error(self): + with pytest.raises(ValueError, match="transformation_matrix should be square"): + transforms.LinearTransformation(transformation_matrix=torch.rand(2, 3), mean_vector=torch.rand(2)) + + with pytest.raises(ValueError, match="mean_vector should have the same length"): + transforms.LinearTransformation(transformation_matrix=torch.rand(2, 2), mean_vector=torch.rand(1)) + + for matrix_dtype, vector_dtype in [(torch.float32, torch.float64), (torch.float64, torch.float32)]: + with pytest.raises(ValueError, match="Input tensors should have the same dtype"): + transforms.LinearTransformation( + transformation_matrix=torch.rand(2, 2, dtype=matrix_dtype), + mean_vector=torch.rand(2, dtype=vector_dtype), + ) + + image = make_image() + transform = transforms.LinearTransformation(transformation_matrix=torch.rand(2, 2), mean_vector=torch.rand(2)) + with pytest.raises(ValueError, match="Input tensor and transformation matrix have incompatible shape"): + transform(image) + + transform = transforms.LinearTransformation(*self._make_matrix_and_vector(image)) + with pytest.raises(TypeError, match="does not support PIL images"): + transform(F.to_pil_image(image)) + + @needs_cuda + def test_transform_error_cuda(self): + for matrix_device, vector_device in [("cuda", "cpu"), ("cpu", "cuda")]: + with pytest.raises(ValueError, match="Input tensors should be on the same device"): + transforms.LinearTransformation( + transformation_matrix=torch.rand(2, 2, device=matrix_device), + mean_vector=torch.rand(2, device=vector_device), + ) + + for input_device, param_device in [("cuda", "cpu"), ("cpu", "cuda")]: + input = make_image(device=input_device) + transform = transforms.LinearTransformation(*self._make_matrix_and_vector(input, device=param_device)) + with pytest.raises( + ValueError, match="Input tensor should be on the same device as transformation matrix and mean vector" + ): + transform(input) + + +def make_image_numpy(*args, **kwargs): + image = make_image_tensor(*args, **kwargs) + return image.permute((1, 2, 0)).numpy() + + +class TestToImage: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_image_numpy]) + @pytest.mark.parametrize("fn", [F.to_image, transform_cls_to_functional(transforms.ToImage)]) + def test_functional_and_transform(self, make_input, fn): + input = make_input() + output = fn(input) + + assert isinstance(output, tv_tensors.Image) + + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert F.get_size(output) == input_size + + if isinstance(input, torch.Tensor): + assert output.data_ptr() == input.data_ptr() + + def test_2d_np_array(self): + # Non-regression test for https://github.com/pytorch/vision/issues/8255 + input = np.random.rand(10, 10) + assert F.to_image(input).shape == (1, 10, 10) + + def test_functional_error(self): + with pytest.raises(TypeError, match="Input can either be a pure Tensor, a numpy array, or a PIL image"): + F.to_image(object()) + + +class TestToPILImage: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_numpy]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("fn", [F.to_pil_image, transform_cls_to_functional(transforms.ToPILImage)]) + def test_functional_and_transform(self, make_input, color_space, fn): + input = 
make_input(color_space=color_space) + output = fn(input) + + assert isinstance(output, PIL.Image.Image) + + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert F.get_size(output) == input_size + + def test_functional_error(self): + with pytest.raises(TypeError, match="pic should be Tensor or ndarray"): + F.to_pil_image(object()) + + for ndim in [1, 4]: + with pytest.raises(ValueError, match="pic should be 2/3 dimensional"): + F.to_pil_image(torch.empty(*[1] * ndim)) + + with pytest.raises(ValueError, match="pic should not have > 4 channels"): + num_channels = 5 + F.to_pil_image(torch.empty(num_channels, 1, 1)) + + +class TestToTensor: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_image_numpy]) + def test_smoke(self, make_input): + with pytest.warns(UserWarning, match="deprecated and will be removed"): + transform = transforms.ToTensor() + + input = make_input() + output = transform(input) + + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert F.get_size(output) == input_size + + +class TestPILToTensor: + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("fn", [F.pil_to_tensor, transform_cls_to_functional(transforms.PILToTensor)]) + def test_functional_and_transform(self, color_space, fn): + input = make_image_pil(color_space=color_space) + output = fn(input) + + assert isinstance(output, torch.Tensor) and not isinstance(output, tv_tensors.TVTensor) + assert F.get_size(output) == F.get_size(input) + + def test_functional_error(self): + with pytest.raises(TypeError, match="pic should be PIL Image"): + F.pil_to_tensor(object()) + + +class TestLambda: + @pytest.mark.parametrize("input", [object(), torch.empty(()), np.empty(()), "string", 1, 0.0]) + @pytest.mark.parametrize("types", [(), (torch.Tensor, np.ndarray)]) + def test_transform(self, input, types): + was_applied = False + + def was_applied_fn(input): + nonlocal was_applied + was_applied = True + return input + + transform = transforms.Lambda(was_applied_fn, *types) + output = transform(input) + + assert output is input + assert was_applied is (not types or isinstance(input, types)) + + +@pytest.mark.parametrize( + ("alias", "target"), + [ + pytest.param(alias, target, id=alias.__name__) + for alias, target in [ + (F.hflip, F.horizontal_flip), + (F.vflip, F.vertical_flip), + (F.get_image_num_channels, F.get_num_channels), + (F.to_pil_image, F.to_pil_image), + (F.elastic_transform, F.elastic), + (F.to_grayscale, F.rgb_to_grayscale), + ] + ], +) +def test_alias(alias, target): + assert alias is target + + +@pytest.mark.parametrize( + "make_inputs", + itertools.permutations( + [ + make_image_tensor, + make_image_tensor, + make_image_pil, + make_image, + make_video, + ], + 3, + ), +) +def test_pure_tensor_heuristic(make_inputs): + flat_inputs = [make_input() for make_input in make_inputs] + + def split_on_pure_tensor(to_split): + # This takes a sequence that is structurally aligned with `flat_inputs` and splits its items into three parts: + # 1. The first pure tensor. If none is present, this will be `None` + # 2. A list of the remaining pure tensors + # 3. A list of all other items + pure_tensors = [] + others = [] + # Splitting always happens on the original `flat_inputs` to avoid any erroneous type changes by the transform to + # affect the splitting. 
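+        # For example, if `flat_inputs` is `[pure_tensor_a, image, pure_tensor_b]`, the items aligned with it are + # split into (item for `pure_tensor_a`, [item for `pure_tensor_b`], [item for `image`]).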
+ for item, inpt in zip(to_split, flat_inputs): + (pure_tensors if is_pure_tensor(inpt) else others).append(item) + return pure_tensors[0] if pure_tensors else None, pure_tensors[1:], others + + class CopyCloneTransform(transforms.Transform): + def transform(self, inpt, params): + return inpt.clone() if isinstance(inpt, torch.Tensor) else inpt.copy() + + @staticmethod + def was_applied(output, inpt): + identity = output is inpt + if identity: + return False + + # Make sure nothing fishy is going on + assert_equal(output, inpt) + return True + + first_pure_tensor_input, other_pure_tensor_inputs, other_inputs = split_on_pure_tensor(flat_inputs) + + transform = CopyCloneTransform() + transformed_sample = transform(flat_inputs) + + first_pure_tensor_output, other_pure_tensor_outputs, other_outputs = split_on_pure_tensor(transformed_sample) + + if first_pure_tensor_input is not None: + if other_inputs: + assert not transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) + else: + assert transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) + + for output, inpt in zip(other_pure_tensor_outputs, other_pure_tensor_inputs): + assert not transform.was_applied(output, inpt) + + for input, output in zip(other_inputs, other_outputs): + assert transform.was_applied(output, input) + + +class TestRandomIoUCrop: + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) + def test_make_params(self, device, options): + orig_h, orig_w = size = (24, 32) + image = make_image(size) + bboxes = tv_tensors.BoundingBoxes( + torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), + format="XYXY", + canvas_size=size, + device=device, + ) + sample = [image, bboxes] + + transform = transforms.RandomIoUCrop(sampler_options=options) + + n_samples = 5 + for _ in range(n_samples): + + params = transform.make_params(sample) + + if options == [2.0]: + assert len(params) == 0 + return + + assert len(params["is_within_crop_area"]) > 0 + assert params["is_within_crop_area"].dtype == torch.bool + + assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) + assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) + + left, top = params["left"], params["top"] + new_h, new_w = params["height"], params["width"] + ious = box_iou( + bboxes, + torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device), + ) + assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}" + + def test__transform_empty_params(self, mocker): + transform = transforms.RandomIoUCrop(sampler_options=[2.0]) + image = tv_tensors.Image(torch.rand(1, 3, 4, 4)) + bboxes = tv_tensors.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", canvas_size=(4, 4)) + label = torch.tensor([1]) + sample = [image, bboxes, label] + # Let's mock transform.make_params to control the output: + transform.make_params = mocker.MagicMock(return_value={}) + output = transform(sample) + torch.testing.assert_close(output, sample) + + def test_forward_assertion(self): + transform = transforms.RandomIoUCrop() + with pytest.raises( + TypeError, + match="requires input sample to contain tensor or PIL images and bounding boxes", + ): + transform(torch.tensor(0)) + + def test__transform(self, mocker): + transform = transforms.RandomIoUCrop() + + size = (32, 24) + image = make_image(size) + bboxes = make_bounding_boxes(format="XYXY", 
canvas_size=size, num_boxes=6) + masks = make_detection_masks(size, num_masks=6) + + sample = [image, bboxes, masks] + + is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool) + + params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area) + transform.make_params = mocker.MagicMock(return_value=params) + output = transform(sample) + + # check number of bboxes vs number of labels: + output_bboxes = output[1] + assert isinstance(output_bboxes, tv_tensors.BoundingBoxes) + assert (output_bboxes[~is_within_crop_area] == 0).all() + + output_masks = output[2] + assert isinstance(output_masks, tv_tensors.Mask) + + +class TestRandomShortestSize: + @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) + def test_make_params(self, min_size, max_size): + canvas_size = (3, 10) + + transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True) + + sample = make_image(canvas_size) + params = transform.make_params([sample]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + + longer = max(size) + shorter = min(size) + if max_size is not None: + assert longer <= max_size + assert shorter <= max_size + else: + assert shorter in min_size + + +class TestRandomResize: + def test_make_params(self): + min_size = 3 + max_size = 6 + + transform = transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) + + for _ in range(10): + params = transform.make_params([]) + + assert isinstance(params["size"], list) and len(params["size"]) == 1 + size = params["size"][0] + + assert min_size <= size < max_size + + +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) +@pytest.mark.parametrize("label_type", (torch.Tensor, int)) +@pytest.mark.parametrize("dataset_return_type", (dict, tuple)) +@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) +def test_classification_preset(image_type, label_type, dataset_return_type, to_tensor): + + image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8)) + if image_type is PIL.Image: + image = to_pil_image(image[0]) + elif image_type is torch.Tensor: + image = image.as_subclass(torch.Tensor) + assert is_pure_tensor(image) + + label = 1 if label_type is int else torch.tensor([1]) + + if dataset_return_type is dict: + sample = { + "image": image, + "label": label, + } + else: + sample = image, label + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + + t = transforms.Compose( + [ + transforms.RandomResizedCrop((224, 224), antialias=True), + transforms.RandomHorizontalFlip(p=1), + transforms.RandAugment(), + transforms.TrivialAugmentWide(), + transforms.AugMix(), + transforms.AutoAugment(), + to_tensor, + # TODO: ConvertImageDtype is a pass-through on PIL images, is that + # intended? This results in a failure if we convert to tensor after + # it, because the image would still be uint8 which make Normalize + # fail. 
+ transforms.ConvertImageDtype(torch.float), + transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1]), + transforms.RandomErasing(p=1), + ] + ) + + out = t(sample) + + assert type(out) == type(sample) + + if dataset_return_type is tuple: + out_image, out_label = out + else: + assert out.keys() == sample.keys() + out_image, out_label = out.values() + + assert out_image.shape[-2:] == (224, 224) + assert out_label == label + + +@pytest.mark.parametrize("input_size", [(17, 11), (11, 17), (11, 11)]) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_parallelogram_to_bounding_boxes(input_size, device): + # Assert that applying `_parallelogram_to_bounding_boxes` to rotated boxes + # does not modify the input. + bounding_boxes = make_bounding_boxes(input_size, format=tv_tensors.BoundingBoxFormat.XYXYXYXY, device=device) + actual = _parallelogram_to_bounding_boxes(bounding_boxes) + torch.testing.assert_close(actual, bounding_boxes, rtol=0, atol=1) + + # Test the transformation of two simple parallelograms. + # 1---2 1----2 + # / / -> | | + # 4---3 4----3 + + # 1---2 1----2 + # \ \ -> | | + # 4---3 4----3 + parallelogram = torch.tensor( + [[1, 0, 4, 0, 3, 2, 0, 2], [0, 0, 3, 0, 4, 2, 1, 2]], + dtype=torch.float32, + ) + expected = torch.tensor( + [ + [0, 0, 4, 0, 4, 2, 0, 2], + [0, 0, 4, 0, 4, 2, 0, 2], + ], + dtype=torch.float32, + ) + actual = _parallelogram_to_bounding_boxes(parallelogram) + torch.testing.assert_close(actual, expected) + + # Test the transformation of a simple parallelogram. + # 1 + # 1-2 / 2 + # / / -> / / + # 4-3 4 / + # 3 + # + # 1 + # 1-2 \ 2 + # \ \ -> \ \ + # 4-3 4 \ + # 3 + parallelogram = torch.tensor( + [[0, 4, 3, 1, 5, 1, 2, 4], [0, 1, 2, 1, 5, 4, 3, 4]], + dtype=torch.float32, + ) + expected = torch.tensor( + [[0, 4, 4, 0, 5, 1, 1, 5], [0, 1, 1, 0, 5, 4, 4, 5]], + dtype=torch.float32, + ) + actual = _parallelogram_to_bounding_boxes(parallelogram) + torch.testing.assert_close(actual, expected) + + +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) +@pytest.mark.parametrize("data_augmentation", ("hflip", "lsj", "multiscale", "ssd", "ssdlite")) +@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) +@pytest.mark.parametrize("sanitize", (True, False)) +def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): + torch.manual_seed(0) + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + + if data_augmentation == "hflip": + t = [ + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "lsj": + t = [ + transforms.ScaleJitter(target_size=(1024, 1024), antialias=True), + # Note: replaced FixedSizeCrop with RandomCrop, because we're + # leaving FixedSizeCrop in prototype for now, and it expects Label + # classes which we won't release yet.
+ # transforms.FixedSizeCrop( + # size=(1024, 1024), fill=defaultdict(lambda: (123.0, 117.0, 104.0), {tv_tensors.Mask: 0}) + # ), + transforms.RandomCrop((1024, 1024), pad_if_needed=True), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "multiscale": + t = [ + transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True + ), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "ssd": + t = [ + transforms.RandomPhotometricDistort(p=1), + transforms.RandomZoomOut(fill={"others": (123.0, 117.0, 104.0), tv_tensors.Mask: 0}, p=1), + transforms.RandomIoUCrop(), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "ssdlite": + t = [ + transforms.RandomIoUCrop(), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + if sanitize: + t += [transforms.SanitizeBoundingBoxes()] + t = transforms.Compose(t) + + num_boxes = 5 + H = W = 250 + + image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8)) + if image_type is PIL.Image: + image = to_pil_image(image[0]) + elif image_type is torch.Tensor: + image = image.as_subclass(torch.Tensor) + assert is_pure_tensor(image) + + label = torch.randint(0, 10, size=(num_boxes,)) + + boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4)) + boxes[:, 2:] += boxes[:, :2] + boxes = boxes.clamp(min=0, max=min(H, W)) + boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + + masks = tv_tensors.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) + + sample = { + "image": image, + "label": label, + "boxes": boxes, + "masks": masks, + } + + out = t(sample) + + if isinstance(to_tensor, transforms.ToTensor) and image_type is not tv_tensors.Image: + assert is_pure_tensor(out["image"]) + else: + assert isinstance(out["image"], tv_tensors.Image) + assert isinstance(out["label"], type(sample["label"])) + + num_boxes_expected = { + # ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It + # doesn't remove them strictly speaking, it just marks some boxes as + # degenerate and those boxes will be later removed by + # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize + # param is True. + # Note that the values below are probably specific to the random seed + # set above (which is fine). 
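+        # The keys below are (sanitize, data_augmentation) pairs; any combination that is not listed is expected to + # keep all `num_boxes` boxes (see the `.get()` fallback below).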
+ (True, "ssd"): 5, + (True, "ssdlite"): 4, + }.get((sanitize, data_augmentation), num_boxes) + + assert out["boxes"].shape[0] == out["masks"].shape[0] == out["label"].shape[0] == num_boxes_expected + + +class TestSanitizeBoundingBoxes: + def _get_boxes_and_valid_mask(self, H=256, W=128, min_size=10, min_area=10): + boxes_and_validity = [ + ([0, 1, 10, 1], False), # Y1 == Y2 + ([0, 1, 0, 20], False), # X1 == X2 + ([0, 0, min_size - 1, 10], False), # H < min_size + ([0, 0, 10, min_size - 1], False), # W < min_size + ([0, 0, 10, H + 1], False), # Y2 > H + ([0, 0, W + 1, 10], False), # X2 > W + ([-1, 1, 10, 20], False), # any < 0 + ([0, 0, -1, 20], False), # any < 0 + ([0, 0, -10, -1], False), # any < 0 + ([0, 0, min_size, 10], min_size * 10 >= min_area), # H < min_size + ([0, 0, 10, min_size], min_size * 10 >= min_area), # W < min_size + ([0, 0, W, H], W * H >= min_area), + ([1, 1, 30, 20], 29 * 19 >= min_area), + ([0, 0, 10, 10], 9 * 9 >= min_area), + ([1, 1, 30, 20], 29 * 19 >= min_area), + ] + + random.shuffle(boxes_and_validity) # For test robustness: mix order of wrong and correct cases + boxes, expected_valid_mask = zip(*boxes_and_validity) + boxes = tv_tensors.BoundingBoxes( + boxes, + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(H, W), + ) + + return boxes, expected_valid_mask + + @pytest.mark.parametrize("min_size, min_area", ((1, 1), (10, 1), (10, 101))) + @pytest.mark.parametrize( + "labels_getter", + ( + "default", + lambda inputs: inputs["labels"], + lambda inputs: (inputs["labels"], inputs["other_labels"]), + lambda inputs: [inputs["labels"], inputs["other_labels"]], + None, + lambda inputs: None, + ), + ) + @pytest.mark.parametrize("sample_type", (tuple, dict)) + def test_transform(self, min_size, min_area, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + + H, W = 256, 128 + boxes, expected_valid_mask = self._get_boxes_and_valid_mask(H=H, W=W, min_size=min_size, min_area=min_area) + valid_indices = [i for (i, is_valid) in enumerate(expected_valid_mask) if is_valid] + + labels = torch.arange(boxes.shape[0]) + masks = tv_tensors.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) + # other_labels corresponds to properties from COCO like iscrowd, area... 
+ # We only sanitize it when labels_getter returns a tuple + other_labels = torch.arange(boxes.shape[0]) + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) + sample = { + "image": input_img, + "labels": labels, + "boxes": boxes, + "other_labels": other_labels, + "whatever": whatever, + "None": None, + "masks": masks, + } + + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) + + out = transforms.SanitizeBoundingBoxes(min_size=min_size, min_area=min_area, labels_getter=labels_getter)( + sample + ) + + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_other_labels = out[1]["other_labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_other_labels = out["other_labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] + + assert out_image is input_img + assert out_whatever is whatever + + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + assert isinstance(out_masks, tv_tensors.Mask) + + if labels_getter is None or (callable(labels_getter) and labels_getter(sample) is None): + assert out_labels is labels + assert out_other_labels is other_labels + else: + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] + # This works because we conveniently set labels to arange(num_boxes) + assert out_labels.tolist() == valid_indices + + if callable(labels_getter) and isinstance(labels_getter(sample), (tuple, list)): + assert_equal(out_other_labels, out_labels) + else: + assert_equal(out_other_labels, other_labels) + + @pytest.mark.parametrize("input_type", (torch.Tensor, tv_tensors.BoundingBoxes)) + def test_functional(self, input_type): + # Note: the "functional" F.sanitize_bounding_boxes was added after the class, so there is some + # redundancy with test_transform() in terms of correctness checks. But that's OK. 
+ + H, W, min_size = 256, 128, 10 + + boxes, expected_valid_mask = self._get_boxes_and_valid_mask(H=H, W=W, min_size=min_size) + + if input_type is tv_tensors.BoundingBoxes: + format = canvas_size = None + else: + # just passing "XYXY" explicitly to make sure we support strings + format, canvas_size = "XYXY", boxes.canvas_size + boxes = boxes.as_subclass(torch.Tensor) + + boxes, valid = F.sanitize_bounding_boxes(boxes, format=format, canvas_size=canvas_size, min_size=min_size) + + assert_equal(valid, torch.tensor(expected_valid_mask)) + assert type(valid) == torch.Tensor + assert boxes.shape[0] == sum(valid) + assert isinstance(boxes, input_type) + + def test_kernel(self): + H, W, min_size = 256, 128, 10 + boxes, _ = self._get_boxes_and_valid_mask(H=H, W=W, min_size=min_size) + + format, canvas_size = boxes.format, boxes.canvas_size + boxes = boxes.as_subclass(torch.Tensor) + + check_kernel( + F.sanitize_bounding_boxes, + input=boxes, + format=format, + canvas_size=canvas_size, + check_batched_vs_unbatched=False, + ) + + def test_no_label(self): + # Non-regression test for https://github.com/pytorch/vision/issues/7878 + + img = make_image() + boxes = make_bounding_boxes() + + with pytest.raises(ValueError, match="or a two-tuple whose second item is a dict"): + transforms.SanitizeBoundingBoxes()(img, boxes) + + out_img, out_boxes = transforms.SanitizeBoundingBoxes(labels_getter=None)(img, boxes) + assert isinstance(out_img, tv_tensors.Image) + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + + def test_errors_transform(self): + good_bbox = tv_tensors.BoundingBoxes( + [[0, 0, 10, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(20, 20), + ) + + with pytest.raises(ValueError, match="min_size must be >= 1"): + transforms.SanitizeBoundingBoxes(min_size=0) + with pytest.raises(ValueError, match="min_area must be >= 1"): + transforms.SanitizeBoundingBoxes(min_area=0) + with pytest.raises(ValueError, match="labels_getter should either be 'default'"): + transforms.SanitizeBoundingBoxes(labels_getter=12) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} + transforms.SanitizeBoundingBoxes()(bad_labels_key) + + with pytest.raises(ValueError, match="must be a tensor"): + not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} + transforms.SanitizeBoundingBoxes()(not_a_tensor) + + with pytest.raises(ValueError, match="Number of boxes"): + different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} + transforms.SanitizeBoundingBoxes()(different_sizes) + + def test_errors_functional(self): + + good_bbox = tv_tensors.BoundingBoxes( + [[0, 0, 10, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(20, 20), + ) + + with pytest.raises(ValueError, match="canvas_size cannot be None if bounding_boxes is a pure tensor"): + F.sanitize_bounding_boxes(good_bbox.as_subclass(torch.Tensor), format="XYXY", canvas_size=None) + + with pytest.raises(ValueError, match="canvas_size cannot be None if bounding_boxes is a pure tensor"): + F.sanitize_bounding_boxes(good_bbox.as_subclass(torch.Tensor), format=None, canvas_size=(10, 10)) + + with pytest.raises(ValueError, match="canvas_size must be None when bounding_boxes is a tv_tensors"): + F.sanitize_bounding_boxes(good_bbox, format="XYXY", canvas_size=None) + + with pytest.raises(ValueError, match="canvas_size must be None when bounding_boxes is a tv_tensors"): + 
F.sanitize_bounding_boxes(good_bbox, format="XYXY", canvas_size=None) + + with pytest.raises(ValueError, match="bounding_boxes must be a tv_tensors.BoundingBoxes instance or a"): + F.sanitize_bounding_boxes(good_bbox.tolist()) + + +class TestJPEG: + @pytest.mark.parametrize("quality", [5, 75]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + def test_kernel_image(self, quality, color_space): + check_kernel(F.jpeg_image, make_image(color_space=color_space), quality=quality) + + def test_kernel_video(self): + check_kernel(F.jpeg_video, make_video(), quality=5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.jpeg, make_input(), quality=5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.jpeg_image, torch.Tensor), + (F._augment._jpeg_image_pil, PIL.Image.Image), + (F.jpeg_image, tv_tensors.Image), + (F.jpeg_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.jpeg, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + @pytest.mark.parametrize("quality", [5, (10, 20)]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + def test_transform(self, make_input, quality, color_space): + check_transform(transforms.JPEG(quality=quality), make_input(color_space=color_space)) + + @pytest.mark.parametrize("quality", [5]) + def test_functional_image_correctness(self, quality): + image = make_image() + + actual = F.jpeg(image, quality=quality) + expected = F.to_image(F.jpeg(F.to_pil_image(image), quality=quality)) + + # NOTE: this will fail if torchvision and Pillow use different JPEG encoder/decoder + torch.testing.assert_close(actual, expected, rtol=0, atol=1) + + @pytest.mark.parametrize("quality", [5, (10, 20)]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, quality, color_space, seed): + image = make_image(color_space=color_space) + + transform = transforms.JPEG(quality=quality) + + with freeze_rng_state(): + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected, rtol=0, atol=1) + + @pytest.mark.parametrize("quality", [5, (10, 20)]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transformmake_params_bounds(self, quality, seed): + transform = transforms.JPEG(quality=quality) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform.make_params([]) + + if isinstance(quality, int): + assert params["quality"] == quality + else: + assert quality[0] <= params["quality"] <= quality[1] + + @pytest.mark.parametrize("quality", [[0], [0, 0, 0]]) + def test_transform_sequence_len_error(self, quality): + with pytest.raises(ValueError, match="quality should be a sequence of length 2"): + transforms.JPEG(quality=quality) + + @pytest.mark.parametrize("quality", [-1, 0, 150]) + def test_transform_invalid_quality_error(self, quality): + with pytest.raises(ValueError, match="quality must be an integer from 1 to 100"): + transforms.JPEG(quality=quality) + + @pytest.mark.parametrize("quality", [None, True]) + def test_transform_quality_type_error(self, quality): + with pytest.raises(TypeError, match="quality"): + 
transforms.JPEG(quality=quality) + + +class TestUtils: + # TODO: Still need to test has_all, has_any, check_type and get_bounding_boxes + @pytest.mark.parametrize( + "make_input1", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask] + ) + @pytest.mark.parametrize( + "make_input2", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask] + ) + @pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw]) + def test_query_size_and_query_chw(self, make_input1, make_input2, query): + size = (32, 64) + input1 = make_input1(size) + input2 = make_input2(size) + + if query is transforms.query_chw and not any( + transforms.check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video)) + for inpt in (input1, input2) + ): + return + + expected = size if query is transforms.query_size else ((3,) + size) + assert query([input1, input2]) == expected + + @pytest.mark.parametrize( + "make_input1", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask] + ) + @pytest.mark.parametrize( + "make_input2", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask] + ) + @pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw]) + def test_different_sizes(self, make_input1, make_input2, query): + input1 = make_input1((10, 10)) + input2 = make_input2((20, 20)) + if query is transforms.query_chw and not all( + transforms.check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video)) + for inpt in (input1, input2) + ): + return + with pytest.raises(ValueError, match="Found multiple"): + query([input1, input2]) + + @pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw]) + def test_no_valid_input(self, query): + with pytest.raises(TypeError, match="No image"): + query(["blah"]) diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py new file mode 100644 index 00000000000..dab6d525a38 --- /dev/null +++ b/test/test_transforms_v2_utils.py @@ -0,0 +1,102 @@ +import PIL.Image +import pytest + +import torch + +import torchvision.transforms.v2._utils +from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_masks, make_image, make_keypoints + +from torchvision import tv_tensors +from torchvision.transforms.v2._utils import has_all, has_any +from torchvision.transforms.v2.functional import to_pil_image + + +IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") +BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, format=tv_tensors.BoundingBoxFormat.XYXY) +MASK = make_detection_masks(DEFAULT_SIZE) +KEYPOINTS = make_keypoints(DEFAULT_SIZE) + + +@pytest.mark.parametrize( + ("sample", "types", "expected"), + [ + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image, tv_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.KeyPoints,), True), + ((MASK,), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.KeyPoints), False), + ((BOUNDING_BOX,), (tv_tensors.Image, tv_tensors.Mask, 
tv_tensors.KeyPoints), False), + ((IMAGE,), (tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), False), + ((KEYPOINTS,), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), + True, + ), + ((), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), False), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (lambda obj: isinstance(obj, tv_tensors.Image),), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (lambda _: False,), False), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (lambda _: True,), True), + ((IMAGE,), (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), True), + ( + (torch.Tensor(IMAGE),), + (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + True, + ), + ( + (to_pil_image(IMAGE),), + (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + True, + ), + ], +) +def test_has_any(sample, types, expected): + assert has_any(sample, *types) is expected + + +@pytest.mark.parametrize( + ("sample", "types", "expected"), + [ + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image, tv_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Mask, tv_tensors.KeyPoints), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes, tv_tensors.KeyPoints), True), + ( + (IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), + (tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), + True, + ), + ( + (IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), + True, + ), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes), False), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.Mask), False), + ((IMAGE, MASK), (tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), + True, + ), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, BOUNDING_BOX), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (lambda obj: isinstance(obj, (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask)),), + True, + ), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), + ], +) +def test_has_all(sample, types, expected): + assert has_all(sample, *types) is expected diff --git a/test/test_transforms_video.py b/test/test_transforms_video.py index 21594868f09..4ad57e6a98e 100644 --- a/test/test_transforms_video.py +++ b/test/test_transforms_video.py @@ -23,8 +23,8 @@ def test_random_crop_video(self): numFrames = random.randint(4, 128) height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 
2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 clip = torch.randint(0, 256, (numFrames, height, width, 3), dtype=torch.uint8) result = Compose( [ @@ -41,8 +41,8 @@ def test_random_resized_crop_video(self): numFrames = random.randint(4, 128) height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 clip = torch.randint(0, 256, (numFrames, height, width, 3), dtype=torch.uint8) result = Compose( [ @@ -59,8 +59,8 @@ def test_center_crop_video(self): numFrames = random.randint(4, 128) height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 clip = torch.ones((numFrames, height, width, 3), dtype=torch.uint8) * 255 oh1 = (height - oheight) // 2 diff --git a/test/test_tv_tensors.py b/test/test_tv_tensors.py new file mode 100644 index 00000000000..f9d545eb9c9 --- /dev/null +++ b/test/test_tv_tensors.py @@ -0,0 +1,445 @@ +from copy import deepcopy + +import pytest +import torch +from common_utils import ( + assert_equal, + make_bounding_boxes, + make_image, + make_keypoints, + make_segmentation_mask, + make_video, +) +from PIL import Image + +from torchvision import tv_tensors + + +@pytest.fixture(autouse=True) +def restore_tensor_return_type(): + # This is for security, as we should already be restoring the default manually in each test anyway + # (at least at the time of writing...) 
+ yield + tv_tensors.set_return_type("Tensor") + + +@pytest.mark.parametrize("data", [torch.rand(3, 32, 32), Image.new("RGB", (32, 32), color=123)]) +def test_image_instance(data): + image = tv_tensors.Image(data) + assert isinstance(image, torch.Tensor) + assert image.ndim == 3 and image.shape[0] == 3 + + +@pytest.mark.parametrize("data", [torch.randint(0, 10, size=(1, 32, 32)), Image.new("L", (32, 32), color=2)]) +def test_mask_instance(data): + mask = tv_tensors.Mask(data) + assert isinstance(mask, torch.Tensor) + assert mask.ndim == 3 and mask.shape[0] == 1 + + +@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]], [1, 2, 3, 4]]) +@pytest.mark.parametrize( + "format", ["XYXY", "CXCYWH", tv_tensors.BoundingBoxFormat.XYXY, tv_tensors.BoundingBoxFormat.XYWH] +) +def test_bbox_instance(data, format): + bboxes = tv_tensors.BoundingBoxes(data, format=format, canvas_size=(32, 32)) + assert isinstance(bboxes, torch.Tensor) + assert bboxes.ndim == 2 and bboxes.shape[1] == 4 + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[(format.upper())] + assert bboxes.format == format + + +@pytest.mark.parametrize( + "format, is_rotated_expected", + [ + ("XYXY", False), + ("XYWH", False), + ("CXCYWH", False), + ("XYXYXYXY", True), + ("XYWHR", True), + ("CXCYWHR", True), + (tv_tensors.BoundingBoxFormat.XYXY, False), + (tv_tensors.BoundingBoxFormat.XYWH, False), + (tv_tensors.BoundingBoxFormat.CXCYWH, False), + (tv_tensors.BoundingBoxFormat.XYXYXYXY, True), + (tv_tensors.BoundingBoxFormat.XYWHR, True), + (tv_tensors.BoundingBoxFormat.CXCYWHR, True), + ], +) +@pytest.mark.parametrize("scripted", (False, True)) +def test_bbox_format(format, is_rotated_expected, scripted): + fn = tv_tensors.is_rotated_bounding_format + if scripted: + fn = torch.jit.script(fn) + assert fn(format) == is_rotated_expected + + +@pytest.mark.parametrize( + "format, support_integer_dtype", + [ + ("XYXY", True), + ("XYWH", True), + ("CXCYWH", True), + ("XYXYXYXY", False), + ("XYWHR", False), + ("CXCYWHR", False), + (tv_tensors.BoundingBoxFormat.XYXY, True), + (tv_tensors.BoundingBoxFormat.XYWH, True), + (tv_tensors.BoundingBoxFormat.CXCYWH, True), + (tv_tensors.BoundingBoxFormat.XYXYXYXY, False), + (tv_tensors.BoundingBoxFormat.XYWHR, False), + (tv_tensors.BoundingBoxFormat.CXCYWHR, False), + ], +) +@pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) +def test_bbox_format_dtype(format, support_integer_dtype, input_dtype): + tensor = torch.randint(0, 32, size=(5, 2), dtype=input_dtype) + if not input_dtype.is_floating_point and not support_integer_dtype: + with pytest.raises(ValueError, match="Rotated bounding boxes should be floating point tensors"): + tv_tensors.BoundingBoxes(tensor, format=format, canvas_size=(32, 32)) + else: + tv_tensors.BoundingBoxes(tensor, format=format, canvas_size=(32, 32)) + + +def test_bbox_dim_error(): + data_3d = [[[1, 2, 3, 4]]] + with pytest.raises(ValueError, match="Expected a 1D or 2D tensor, got 3D"): + tv_tensors.BoundingBoxes(data_3d, format="XYXY", canvas_size=(32, 32)) + + +@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 2)), [[0, 0], [2, 2]], [1, 2]]) +def test_keypoints_instance(data): + kpoint = tv_tensors.KeyPoints(data, canvas_size=(32, 32)) + assert isinstance(kpoint, torch.Tensor) + assert type(kpoint) is tv_tensors.KeyPoints + assert kpoint.shape[-1] == 2 + + +def test_keypoints_shape_error(): + with pytest.raises(ValueError, match="Expected a tensor of shape"): + 
tv_tensors.KeyPoints(torch.tensor([[1, 2, 3]]), canvas_size=(11, 7)) + + +@pytest.mark.parametrize( + ("data", "input_requires_grad", "expected_requires_grad"), + [ + ([[[0.0, 1.0], [0.0, 1.0]]], None, False), + ([[[0.0, 1.0], [0.0, 1.0]]], False, False), + ([[[0.0, 1.0], [0.0, 1.0]]], True, True), + (torch.rand(3, 16, 16, requires_grad=False), None, False), + (torch.rand(3, 16, 16, requires_grad=False), False, False), + (torch.rand(3, 16, 16, requires_grad=False), True, True), + (torch.rand(3, 16, 16, requires_grad=True), None, True), + (torch.rand(3, 16, 16, requires_grad=True), False, False), + (torch.rand(3, 16, 16, requires_grad=True), True, True), + ], +) +def test_new_requires_grad(data, input_requires_grad, expected_requires_grad): + tv_tensor = tv_tensors.Image(data, requires_grad=input_requires_grad) + assert tv_tensor.requires_grad is expected_requires_grad + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +def test_isinstance(make_input): + assert isinstance(make_input(), torch.Tensor) + + +def test_wrapping_no_copy(): + tensor = torch.rand(3, 16, 16) + image = tv_tensors.Image(tensor) + + assert image.data_ptr() == tensor.data_ptr() + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +def test_to_wrapping(make_input): + dp = make_input() + + dp_to = dp.to(torch.float64) + + assert type(dp_to) is type(dp) + assert dp_to.dtype is torch.float64 + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_to_tv_tensor_reference(make_input, return_type): + tensor = torch.rand((3, 16, 16), dtype=torch.float64) + dp = make_input() + + with tv_tensors.set_return_type(return_type): + tensor_to = tensor.to(dp) + + assert type(tensor_to) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + assert tensor_to.dtype is dp.dtype + assert type(tensor) is torch.Tensor + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_clone_wrapping(make_input, return_type): + dp = make_input() + + with tv_tensors.set_return_type(return_type): + dp_clone = dp.clone() + + assert type(dp_clone) is type(dp) + assert dp_clone.data_ptr() != dp.data_ptr() + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_requires_grad__wrapping(make_input, return_type): + dp = make_input(dtype=torch.float) + + assert not dp.requires_grad + + with tv_tensors.set_return_type(return_type): + dp_requires_grad = dp.requires_grad_(True) + + assert type(dp_requires_grad) is type(dp) + assert dp.requires_grad + assert dp_requires_grad.requires_grad + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_detach_wrapping(make_input, return_type): + dp = make_input(dtype=torch.float).requires_grad_(True) + + with tv_tensors.set_return_type(return_type): + dp_detached = dp.detach() + + assert type(dp_detached) is type(dp) + + 
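For context, a minimal sketch (not part of this patch) of the subclass-preservation semantics that the wrapping tests above and the metadata tests below assert, using only the tv_tensors API introduced elsewhere in this diff:

    import torch
    from torchvision import tv_tensors

    img = tv_tensors.Image(torch.rand(3, 16, 16))  # wraps the tensor, no copy

    # Ops in _FORCE_TORCHFUNCTION_SUBCLASS (.to(), .clone(), .detach(), ...) keep the subclass.
    assert type(img.clone()) is tv_tensors.Image

    # Any other operation returns a plain torch.Tensor under the default return type ...
    assert type(img * 2) is torch.Tensor

    # ... unless the return type is switched, or the result is re-wrapped explicitly.
    with tv_tensors.set_return_type("TVTensor"):
        assert type(img * 2) is tv_tensors.Image
    assert type(tv_tensors.wrap(img * 2, like=img)) is tv_tensors.Image
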
+@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_force_subclass_with_metadata(return_type): + # Sanity checks for the ops in _FORCE_TORCHFUNCTION_SUBCLASS and tv_tensors with metadata + # Largely the same as above, we additionally check that the metadata is preserved + format, canvas_size = "XYXY", (32, 32) + bbox = tv_tensors.BoundingBoxes([[0, 0, 5, 5], [2, 2, 7, 7]], format=format, canvas_size=canvas_size) + kpoints = tv_tensors.KeyPoints([[0, 0], [2, 2]], canvas_size=canvas_size) + + tv_tensors.set_return_type(return_type) + bbox = bbox.clone() + kpoints = kpoints.clone() + if return_type == "TVTensor": + assert kpoints.canvas_size == canvas_size + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + bbox = bbox.to(torch.float64) + kpoints = kpoints.to(torch.float64) + if return_type == "TVTensor": + assert kpoints.canvas_size == canvas_size + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + bbox = bbox.detach() + kpoints = kpoints.detach() + if return_type == "TVTensor": + assert kpoints.canvas_size == canvas_size + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + if torch.cuda.is_available(): + bbox = bbox.pin_memory() + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + assert not bbox.requires_grad + assert not kpoints.requires_grad + bbox.requires_grad_(True) + kpoints.requires_grad_(True) + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + assert bbox.requires_grad + assert kpoints.canvas_size == canvas_size + assert kpoints.requires_grad + tv_tensors.set_return_type("tensor") + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_other_op_no_wrapping(make_input, return_type): + dp = make_input() + + with tv_tensors.set_return_type(return_type): + # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here + output = dp * 2 + + assert type(output) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize( + "op", + [ + lambda t: t.numpy(), + lambda t: t.tolist(), + lambda t: t.max(dim=-1), + ], +) +def test_no_tensor_output_op_no_wrapping(make_input, op): + dp = make_input() + + output = op(dp) + + assert type(output) is not type(dp) + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_inplace_op_no_wrapping(make_input, return_type): + dp = make_input() + original_type = type(dp) + + with tv_tensors.set_return_type(return_type): + output = dp.add_(0) + + assert type(output) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + assert type(dp) is original_type + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +def test_wrap(make_input): + dp = make_input() + + # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here + output = dp * 2 + + dp_new = tv_tensors.wrap(output, like=dp) + + assert type(dp_new) is type(dp) + assert dp_new.data_ptr() == output.data_ptr() + + +@pytest.mark.parametrize( + 
"make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("requires_grad", [False, True]) +def test_deepcopy(make_input, requires_grad): + dp = make_input(dtype=torch.float) + + dp.requires_grad_(requires_grad) + + dp_deepcopied = deepcopy(dp) + + assert dp_deepcopied is not dp + assert dp_deepcopied.data_ptr() != dp.data_ptr() + assert_equal(dp_deepcopied, dp) + + assert type(dp_deepcopied) is type(dp) + assert dp_deepcopied.requires_grad is requires_grad + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +@pytest.mark.parametrize( + "op", + ( + lambda dp: dp + torch.rand(*dp.shape), + lambda dp: torch.rand(*dp.shape) + dp, + lambda dp: dp * torch.rand(*dp.shape), + lambda dp: torch.rand(*dp.shape) * dp, + lambda dp: dp + 3, + lambda dp: 3 + dp, + lambda dp: dp + dp, + lambda dp: dp.sum(), + lambda dp: dp.reshape(-1), + lambda dp: dp.int(), + lambda dp: torch.stack([dp, dp]), + lambda dp: torch.chunk(dp, 2)[0], + lambda dp: torch.unbind(dp)[0], + ), +) +def test_usual_operations(make_input, return_type, op): + + dp = make_input() + with tv_tensors.set_return_type(return_type): + out = op(dp) + assert type(out) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + if isinstance(dp, tv_tensors.BoundingBoxes) and return_type == "TVTensor": + assert hasattr(out, "format") + assert hasattr(out, "canvas_size") + + +def test_subclasses(): + img = make_image() + masks = make_segmentation_mask() + + with pytest.raises(TypeError, match="unsupported operand"): + img + masks + + +def test_set_return_type(): + img = make_image() + + assert type(img + 3) is torch.Tensor + + with tv_tensors.set_return_type("TVTensor"): + assert type(img + 3) is tv_tensors.Image + assert type(img + 3) is torch.Tensor + + tv_tensors.set_return_type("TVTensor") + assert type(img + 3) is tv_tensors.Image + + with tv_tensors.set_return_type("tensor"): + assert type(img + 3) is torch.Tensor + with tv_tensors.set_return_type("TVTensor"): + assert type(img + 3) is tv_tensors.Image + tv_tensors.set_return_type("tensor") + assert type(img + 3) is torch.Tensor + assert type(img + 3) is torch.Tensor + # Exiting a context manager will restore the return type as it was prior to entering it, + # regardless of whether the "global" tv_tensors.set_return_type() was called within the context manager. 
+ assert type(img + 3) is tv_tensors.Image + + tv_tensors.set_return_type("tensor") + + +def test_return_type_input(): + img = make_image() + + # Case-insensitive + with tv_tensors.set_return_type("tvtensor"): + assert type(img + 3) is tv_tensors.Image + + with pytest.raises(ValueError, match="return_type must be"): + tv_tensors.set_return_type("typo") + + tv_tensors.set_return_type("tensor") + + +def test_box_clamping_mode_default_and_error(): + assert ( + tv_tensors.BoundingBoxes([0.0, 0.0, 10.0, 10.0], format="XYXY", canvas_size=(100, 100)).clamping_mode == "soft" + ) + assert ( + tv_tensors.BoundingBoxes([0.0, 0.0, 10.0, 10.0, 0.0], format="XYWHR", canvas_size=(100, 100)).clamping_mode + == "soft" + ) + + with pytest.raises(ValueError, match="clamping_mode must be"): + tv_tensors.BoundingBoxes([0, 0, 10, 10], format="XYXY", canvas_size=(100, 100), clamping_mode="bad") diff --git a/test/test_utils.py b/test/test_utils.py index dde3ee90dc3..8b6f357ce6e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -9,14 +9,33 @@ import torch import torchvision.transforms.functional as F import torchvision.utils as utils -from common_utils import assert_equal +from common_utils import assert_equal, cpu_and_cuda from PIL import __version__ as PILLOW_VERSION, Image, ImageColor +from torchvision.transforms.v2.functional import to_dtype PILLOW_VERSION = tuple(int(x) for x in PILLOW_VERSION.split(".")) boxes = torch.tensor([[0, 0, 20, 20], [0, 0, 0, 0], [10, 15, 30, 35], [23, 35, 93, 95]], dtype=torch.float) - +rotated_boxes = torch.tensor( + [ + [100, 150, 150, 150, 150, 250, 100, 250], + [200, 350, 250, 350, 250, 250, 200, 250], + [300, 200, 200, 200, 200, 250, 300, 250], + # Not really a rectangle, but it doesn't matter + [ + 100, + 100, + 200, + 50, + 290, + 350, + 200, + 400, + ], + ], + dtype=torch.float, +) keypoints = torch.tensor([[[10, 10], [5, 5], [2, 2]], [[20, 20], [30, 30], [3, 3]]], dtype=torch.float) @@ -105,7 +124,7 @@ def test_draw_boxes(): res = Image.fromarray(result.permute(1, 2, 0).contiguous().numpy()) res.save(path) - if PILLOW_VERSION >= (8, 2): + if PILLOW_VERSION >= (10, 1): # The reference image is only valid for new PIL versions expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) assert_equal(result, expected) @@ -115,11 +134,85 @@ def test_draw_boxes(): assert_equal(img, img_cp) +@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1") +def test_draw_boxes_with_coloured_labels(): + img = torch.full((3, 100, 100), 255, dtype=torch.uint8) + labels = ["a", "b", "c", "d"] + colors = ["green", "#FF00FF", (0, 255, 0), "red"] + label_colors = ["green", "red", (0, 255, 0), "#FF00FF"] + result = utils.draw_bounding_boxes(img, boxes, labels=labels, colors=colors, fill=True, label_colors=label_colors) + + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_boxes_different_label_colors.png" + ) + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1") +def test_draw_boxes_with_coloured_label_backgrounds(): + img = torch.full((3, 100, 100), 255, dtype=torch.uint8) + labels = ["a", "b", "c", "d"] + colors = ["green", "#FF00FF", (0, 255, 0), "red"] + label_colors = ["green", "red", (0, 255, 0), "#FF00FF"] + result = utils.draw_bounding_boxes( + img, boxes, labels=labels, colors=colors, fill=True, 
label_colors=label_colors, fill_labels=True + ) + + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_boxes_different_label_fill_colors.png" + ) + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1") +def test_draw_rotated_boxes(): + img = torch.full((3, 500, 500), 255, dtype=torch.uint8) + colors = ["blue", "yellow", (0, 255, 0), "black"] + + result = utils.draw_bounding_boxes(img, rotated_boxes, colors=colors) + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_rotated_boxes.png") + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1") +def test_draw_rotated_boxes_fill(): + img = torch.full((3, 500, 500), 255, dtype=torch.uint8) + colors = ["blue", "yellow", (0, 255, 0), "black"] + + result = utils.draw_bounding_boxes(img, rotated_boxes, colors=colors, fill=True) + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_rotated_boxes_fill.png") + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +@pytest.mark.parametrize("fill", [True, False]) +def test_draw_boxes_dtypes(fill): + img_uint8 = torch.full((3, 100, 100), 255, dtype=torch.uint8) + out_uint8 = utils.draw_bounding_boxes(img_uint8, boxes, fill=fill) + + assert img_uint8 is not out_uint8 + assert out_uint8.dtype == torch.uint8 + + img_float = to_dtype(img_uint8, torch.float, scale=True) + out_float = utils.draw_bounding_boxes(img_float, boxes, fill=fill) + + assert img_float is not out_float + assert out_float.is_floating_point() + + torch.testing.assert_close(out_uint8, to_dtype(out_float, torch.uint8, scale=True), rtol=0, atol=1) + + @pytest.mark.parametrize("colors", [None, ["red", "blue", "#FF00FF", (1, 34, 122)], "red", "#FF00FF", (1, 34, 122)]) def test_draw_boxes_colors(colors): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) utils.draw_bounding_boxes(img, boxes, fill=False, width=7, colors=colors) + with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): + utils.draw_bounding_boxes(image=img, boxes=boxes, colors=[]) + def test_draw_boxes_vanilla(): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) @@ -148,7 +241,6 @@ def test_draw_boxes_grayscale(): def test_draw_invalid_boxes(): img_tp = ((1, 1, 1), (1, 2, 3)) - img_wrong1 = torch.full((3, 5, 5), 255, dtype=torch.float) img_wrong2 = torch.full((1, 3, 5, 5), 255, dtype=torch.uint8) img_correct = torch.zeros((3, 10, 10), dtype=torch.uint8) boxes = torch.tensor([[0, 0, 20, 20], [0, 0, 0, 0], [10, 15, 30, 35], [23, 35, 93, 95]], dtype=torch.float) @@ -158,8 +250,6 @@ def test_draw_invalid_boxes(): with pytest.raises(TypeError, match="Tensor expected"): utils.draw_bounding_boxes(img_tp, boxes) - with pytest.raises(ValueError, match="Tensor uint8 expected"): - utils.draw_bounding_boxes(img_wrong1, boxes) with pytest.raises(ValueError, match="Pass individual images, not batches"): utils.draw_bounding_boxes(img_wrong2, boxes) with pytest.raises(ValueError, match="Only grayscale and RGB images are supported"): @@ -184,7 +274,7 @@ def test_draw_no_boxes(): boxes = torch.full((0, 4), 0, dtype=torch.float) with 
pytest.warns(UserWarning, match=re.escape("boxes doesn't contain any box. No box was drawn")): res = utils.draw_bounding_boxes(img, boxes) - # Check that the function didnt change the image + # Check that the function didn't change the image assert res.eq(img).all() @@ -200,19 +290,17 @@ def test_draw_no_boxes(): ], ) @pytest.mark.parametrize("alpha", (0, 0.5, 0.7, 1)) -def test_draw_segmentation_masks(colors, alpha): +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_segmentation_masks(colors, alpha, device): """This test makes sure that masks draw their corresponding color where they should""" num_masks, h, w = 2, 100, 100 dtype = torch.uint8 - img = torch.randint(0, 256, size=(3, h, w), dtype=dtype) - masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool) + img = torch.randint(0, 256, size=(3, h, w), dtype=dtype, device=device) + masks = torch.zeros((num_masks, h, w), dtype=torch.bool, device=device) + masks[0, 10:20, 10:20] = True + masks[1, 15:25, 15:25] = True - # For testing we enforce that there's no overlap between the masks. The - # current behaviour is that the last mask's color will take priority when - # masks overlap, but this makes testing slightly harder so we don't really - # care overlap = masks[0] & masks[1] - masks[:, overlap] = False out = utils.draw_segmentation_masks(img, masks, colors=colors, alpha=alpha) assert out.dtype == dtype @@ -231,22 +319,46 @@ def test_draw_segmentation_masks(colors, alpha): for mask, color in zip(masks, colors): if isinstance(color, str): color = ImageColor.getrgb(color) - color = torch.tensor(color, dtype=dtype) + color = torch.tensor(color, dtype=dtype, device=device) if alpha == 1: - assert (out[:, mask] == color[:, None]).all() + assert (out[:, mask & ~overlap] == color[:, None]).all() elif alpha == 0: - assert (out[:, mask] == img[:, mask]).all() + assert (out[:, mask & ~overlap] == img[:, mask & ~overlap]).all() - interpolated_color = (img[:, mask] * (1 - alpha) + color[:, None] * alpha).to(dtype) - torch.testing.assert_close(out[:, mask], interpolated_color, rtol=0.0, atol=1.0) + interpolated_color = (img[:, mask & ~overlap] * (1 - alpha) + color[:, None] * alpha).to(dtype) + torch.testing.assert_close(out[:, mask & ~overlap], interpolated_color, rtol=0.0, atol=1.0) + interpolated_overlap = (img[:, overlap] * (1 - alpha)).to(dtype) + torch.testing.assert_close(out[:, overlap], interpolated_overlap, rtol=0.0, atol=1.0) -def test_draw_segmentation_masks_errors(): + +def test_draw_segmentation_masks_dtypes(): + num_masks, h, w = 2, 100, 100 + + masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool) + + img_uint8 = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8) + out_uint8 = utils.draw_segmentation_masks(img_uint8, masks) + + assert img_uint8 is not out_uint8 + assert out_uint8.dtype == torch.uint8 + + img_float = to_dtype(img_uint8, torch.float, scale=True) + out_float = utils.draw_segmentation_masks(img_float, masks) + + assert img_float is not out_float + assert out_float.is_floating_point() + + torch.testing.assert_close(out_uint8, to_dtype(out_float, torch.uint8, scale=True), rtol=0, atol=1) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_segmentation_masks_errors(device): h, w = 10, 10 - masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool) - img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8) + masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool, device=device) + img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8, device=device) 
with pytest.raises(TypeError, match="The image must be a tensor"): utils.draw_segmentation_masks(image="Not A Tensor Image", masks=masks) @@ -268,22 +380,23 @@ def test_draw_segmentation_masks_errors(): with pytest.raises(ValueError, match="must have the same height and width"): masks_bad_shape = torch.randint(0, 2, size=(h + 4, w), dtype=torch.bool) utils.draw_segmentation_masks(image=img, masks=masks_bad_shape) - with pytest.raises(ValueError, match="There are more masks"): + with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): utils.draw_segmentation_masks(image=img, masks=masks, colors=[]) - with pytest.raises(ValueError, match="colors must be a tuple or a string, or a list thereof"): + with pytest.raises(ValueError, match="`colors` must be a tuple or a string, or a list thereof"): bad_colors = np.array(["red", "blue"]) # should be a list utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors) - with pytest.raises(ValueError, match="It seems that you passed a tuple of colors instead of"): + with pytest.raises(ValueError, match="If passed as tuple, colors should be an RGB triplet"): bad_colors = ("red", "blue") # should be a list utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors) -def test_draw_no_segmention_mask(): - img = torch.full((3, 100, 100), 0, dtype=torch.uint8) - masks = torch.full((0, 100, 100), 0, dtype=torch.bool) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_no_segmention_mask(device): + img = torch.full((3, 100, 100), 0, dtype=torch.uint8, device=device) + masks = torch.full((0, 100, 100), 0, dtype=torch.bool, device=device) with pytest.warns(UserWarning, match=re.escape("masks doesn't contain any mask. No mask was drawn")): res = utils.draw_segmentation_masks(img, masks) - # Check that the function didnt change the image + # Check that the function didn't change the image assert res.eq(img).all() @@ -314,6 +427,13 @@ def test_draw_keypoints_vanilla(): assert_equal(img, img_cp) +def test_draw_keypoins_K_equals_one(): + # Non-regression test for https://github.com/pytorch/vision/pull/8439 + img = torch.full((3, 100, 100), 0, dtype=torch.uint8) + keypoints = torch.tensor([[[10, 10]]], dtype=torch.float) + utils.draw_keypoints(img, keypoints) + + @pytest.mark.parametrize("colors", ["red", "#FF00FF", (1, 34, 122)]) def test_draw_keypoints_colored(colors): # Keypoints is declared on top as global variable @@ -334,6 +454,93 @@ def test_draw_keypoints_colored(colors): assert_equal(img, img_cp) +@pytest.mark.parametrize("connectivity", [[(0, 1)], [(0, 1), (1, 2)]]) +@pytest.mark.parametrize( + "vis", + [ + torch.tensor([[1, 1, 0], [1, 1, 0]], dtype=torch.bool), + torch.tensor([[1, 1, 0], [1, 1, 0]], dtype=torch.float).unsqueeze_(-1), + ], +) +def test_draw_keypoints_visibility(connectivity, vis): + # Keypoints is declared on top as global variable + keypoints_cp = keypoints.clone() + + img = torch.full((3, 100, 100), 0, dtype=torch.uint8) + img_cp = img.clone() + + vis_cp = vis if vis is None else vis.clone() + + result = utils.draw_keypoints( + image=img, + keypoints=keypoints, + connectivity=connectivity, + colors="red", + visibility=vis, + ) + assert result.size(0) == 3 + assert_equal(keypoints, keypoints_cp) + assert_equal(img, img_cp) + + # compare with a fakedata image + # connect the key points 0 to 1 for both skeletons and do not show the other key points + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", 
"draw_keypoints_visibility.png" + ) + if not os.path.exists(path): + res = Image.fromarray(result.permute(1, 2, 0).contiguous().numpy()) + res.save(path) + + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + if vis_cp is None: + assert vis is None + else: + assert_equal(vis, vis_cp) + assert vis.dtype == vis_cp.dtype + + +def test_draw_keypoints_visibility_default(): + # Keypoints is declared on top as global variable + keypoints_cp = keypoints.clone() + + img = torch.full((3, 100, 100), 0, dtype=torch.uint8) + img_cp = img.clone() + + result = utils.draw_keypoints( + image=img, + keypoints=keypoints, + connectivity=[(0, 1)], + colors="red", + visibility=None, + ) + assert result.size(0) == 3 + assert_equal(keypoints, keypoints_cp) + assert_equal(img, img_cp) + + # compare against fakedata image, which connects 0->1 for both key-point skeletons + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_keypoint_vanilla.png") + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +def test_draw_keypoints_dtypes(): + image_uint8 = torch.randint(0, 256, size=(3, 100, 100), dtype=torch.uint8) + image_float = to_dtype(image_uint8, torch.float, scale=True) + + out_uint8 = utils.draw_keypoints(image_uint8, keypoints) + out_float = utils.draw_keypoints(image_float, keypoints) + + assert out_uint8.dtype == torch.uint8 + assert out_uint8 is not image_uint8 + + assert out_float.is_floating_point() + assert out_float is not image_float + + torch.testing.assert_close(out_uint8, to_dtype(out_float, torch.uint8, scale=True), rtol=0, atol=1) + + def test_draw_keypoints_errors(): h, w = 10, 10 img = torch.full((3, 100, 100), 0, dtype=torch.uint8) @@ -352,6 +559,18 @@ def test_draw_keypoints_errors(): with pytest.raises(ValueError, match="keypoints must be of shape"): invalid_keypoints = torch.tensor([[10, 10, 10, 10], [5, 6, 7, 8]], dtype=torch.float) utils.draw_keypoints(image=img, keypoints=invalid_keypoints) + with pytest.raises(ValueError, match=re.escape("visibility must be of shape (num_instances, K)")): + one_dim_visibility = torch.tensor([True, True, True], dtype=torch.bool) + utils.draw_keypoints(image=img, keypoints=keypoints, visibility=one_dim_visibility) + with pytest.raises(ValueError, match=re.escape("visibility must be of shape (num_instances, K)")): + three_dim_visibility = torch.ones((2, 3, 4), dtype=torch.bool) + utils.draw_keypoints(image=img, keypoints=keypoints, visibility=three_dim_visibility) + with pytest.raises(ValueError, match="keypoints and visibility must have the same dimensionality"): + vis_wrong_n = torch.ones((3, 3), dtype=torch.bool) + utils.draw_keypoints(image=img, keypoints=keypoints, visibility=vis_wrong_n) + with pytest.raises(ValueError, match="keypoints and visibility must have the same dimensionality"): + vis_wrong_k = torch.ones((2, 4), dtype=torch.bool) + utils.draw_keypoints(image=img, keypoints=keypoints, visibility=vis_wrong_k) @pytest.mark.parametrize("batch", (True, False)) @@ -369,7 +588,7 @@ def test_flow_to_image(batch): assert img.shape == (2, 3, h, w) if batch else (3, h, w) path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "expected_flow.pt") - expected_img = torch.load(path, map_location="cpu") + expected_img = torch.load(path, map_location="cpu", weights_only=True) if batch: expected_img = torch.stack([expected_img, expected_img]) diff --git a/test/test_video_gpu_decoder.py 
b/test/test_video_gpu_decoder.py index d987db6ddeb..aa6d0aee9e0 100644 --- a/test/test_video_gpu_decoder.py +++ b/test/test_video_gpu_decoder.py @@ -3,6 +3,7 @@ import pytest import torch +import torchvision from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader try: @@ -29,8 +30,9 @@ class TestVideoGPUDecoder: ], ) def test_frame_reading(self, video_file): + torchvision.set_video_backend("cuda") full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path, device="cuda") + decoder = VideoReader(full_path) with av.open(full_path) as container: for av_frame in container.decode(container.streams.video[0]): av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) @@ -54,7 +56,8 @@ def test_frame_reading(self, video_file): ], ) def test_seek_reading(self, keyframes, full_path, duration): - decoder = VideoReader(full_path, device="cuda") + torchvision.set_video_backend("cuda") + decoder = VideoReader(full_path) time = duration / 2 decoder.seek(time, keyframes_only=keyframes) with av.open(full_path) as container: @@ -79,8 +82,9 @@ def test_seek_reading(self, keyframes, full_path, duration): ], ) def test_metadata(self, video_file): + torchvision.set_video_backend("cuda") full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path, device="cuda") + decoder = VideoReader(full_path) video_metadata = decoder.get_metadata()["video"] with av.open(full_path) as container: video = container.streams.video[0] diff --git a/test/test_video_reader.py b/test/test_video_reader.py index 867923d10d0..10995424982 100644 --- a/test/test_video_reader.py +++ b/test/test_video_reader.py @@ -11,7 +11,7 @@ from numpy.random import randint from pytest import approx from torchvision import set_video_backend -from torchvision.io import _HAS_VIDEO_OPT +from torchvision.io import _HAS_CPU_VIDEO_DECODER try: @@ -127,7 +127,7 @@ def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer ascending order. We need to decode more frames even when we meet end pts """ - # seeking in the stream is imprecise. Thus, seek to an ealier PTS by a margin + # seeking in the stream is imprecise. Thus, seek to an earlier PTS by a margin margin = 1 seek_offset = max(start_pts - margin, 0) @@ -263,7 +263,7 @@ def _get_video_tensor(video_dir, video_file): @pytest.mark.skipif(av is None, reason="PyAV unavailable") -@pytest.mark.skipif(_HAS_VIDEO_OPT is False, reason="Didn't compile with ffmpeg") +@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg") class TestVideoReader: def check_separate_decoding_result(self, tv_result, config): """check the decoding results from TorchVision decoder""" diff --git a/test/test_videoapi.py b/test/test_videoapi.py index 895b9b83555..aabcf6407f7 100644 --- a/test/test_videoapi.py +++ b/test/test_videoapi.py @@ -7,7 +7,13 @@ import torchvision from pytest import approx from torchvision.datasets.utils import download_url -from torchvision.io import _HAS_VIDEO_OPT, VideoReader +from torchvision.io import _HAS_CPU_VIDEO_DECODER, VideoReader + + +# WARNING: these tests have been skipped forever on the CI because the video ops +# are never properly available. This is bad, but things have been in a terrible +# state for a long time already as we write this comment, and we'll hopefully be +# able to get rid of this all soon. 
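For context, a minimal sketch (not part of this patch) of the backend-selection pattern these tests are migrated to, assuming the set_video_backend() changes made in torchvision/__init__.py later in this diff; the file path is a placeholder:

    import torchvision
    from torchvision.io import VideoReader

    # Valid backends after this patch: "pyav", "video_reader", "cuda"; selecting a
    # backend that was not compiled in now raises RuntimeError instead of warning.
    torchvision.set_video_backend("video_reader")

    reader = VideoReader("path/to/video.mp4", "video")  # hypothetical path
    for frame in reader:
        data, pts = frame["data"], frame["pts"]  # decoded tensor and its presentation timestamp
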
try: @@ -25,6 +31,13 @@ GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) +def backends(): + backends_ = ["video_reader"] + if av is not None: + backends_.append("pyav") + return backends_ + + def fate(name, path="."): """Download and return a path to a sample from the FFmpeg test suite. See the `FFmpeg Automated Test Environment `_ @@ -49,11 +62,13 @@ def fate(name, path="."): } -@pytest.mark.skipif(_HAS_VIDEO_OPT is False, reason="Didn't compile with ffmpeg") +@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg") class TestVideoApi: @pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_frame_reading(self, test_video): + @pytest.mark.parametrize("backend", backends()) + def test_frame_reading(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) with av.open(full_path) as av_reader: if av_reader.streams.video: @@ -77,6 +92,7 @@ def test_frame_reading(self, test_video): # compare the frames and ptss for i in range(len(vr_frames)): assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) + mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float())) # on average the difference is very small and caused # by decoding (around 1%) @@ -114,12 +130,62 @@ def test_frame_reading(self, test_video): # we assure that there is never more than 1% difference in signal assert max_delta.item() < 0.001 + @pytest.mark.parametrize("stream", ["video", "audio"]) + @pytest.mark.parametrize("test_video", test_videos.keys()) + @pytest.mark.parametrize("backend", backends()) + def test_frame_reading_mem_vs_file(self, test_video, stream, backend): + torchvision.set_video_backend(backend) + full_path = os.path.join(VIDEO_DIR, test_video) + + reader = VideoReader(full_path) + reader_md = reader.get_metadata() + + if stream in reader_md: + # Test video reading from file vs from memory + vr_frames, vr_frames_mem = [], [] + vr_pts, vr_pts_mem = [], [] + # get vr frames + video_reader = VideoReader(full_path, stream) + for vr_frame in video_reader: + vr_frames.append(vr_frame["data"]) + vr_pts.append(vr_frame["pts"]) + + # get vr frames = read from memory + f = open(full_path, "rb") + fbytes = f.read() + f.close() + video_reader_from_mem = VideoReader(fbytes, stream) + + for vr_frame_from_mem in video_reader_from_mem: + vr_frames_mem.append(vr_frame_from_mem["data"]) + vr_pts_mem.append(vr_frame_from_mem["pts"]) + + # same number of frames + assert len(vr_frames) == len(vr_frames_mem) + assert len(vr_pts) == len(vr_pts_mem) + + # compare the frames and ptss + for i in range(len(vr_frames)): + assert vr_pts[i] == vr_pts_mem[i] + mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float())) + # on average the difference is very small and caused + # by decoding (around 1%) + # TODO: asses empirically how to set this? 
atm it's 1% + # averaged over all frames + assert mean_delta.item() < 2.55 + + del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem + else: + del reader, reader_md + @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_metadata(self, test_video, config): + @pytest.mark.parametrize("backend", backends()) + def test_metadata(self, test_video, config, backend): """ Test that the metadata returned via pyav corresponds to the one returned by the new video decoder API """ + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) reader = VideoReader(full_path, "video") reader_md = reader.get_metadata() @@ -127,7 +193,9 @@ def test_metadata(self, test_video, config): assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5) @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_seek_start(self, test_video): + @pytest.mark.parametrize("backend", backends()) + def test_seek_start(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) video_reader = VideoReader(full_path, "video") num_frames = 0 @@ -153,7 +221,9 @@ def test_seek_start(self, test_video): assert start_num_frames == num_frames @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_accurateseek_middle(self, test_video): + @pytest.mark.parametrize("backend", ["video_reader"]) + def test_accurateseek_middle(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) stream = "video" video_reader = VideoReader(full_path, stream) @@ -192,7 +262,9 @@ def test_fate_suite(self): @pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_keyframe_reading(self, test_video, config): + @pytest.mark.parametrize("backend", backends()) + def test_keyframe_reading(self, test_video, config, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) av_reader = av.open(full_path) @@ -227,6 +299,14 @@ def test_keyframe_reading(self, test_video, config): for i in range(len(av_keyframes)): assert av_keyframes[i] == approx(vr_keyframes[i], rel=0.001) + def test_src(self): + with pytest.raises(ValueError, match="src cannot be empty"): + VideoReader(src="") + with pytest.raises(ValueError, match="src must be either string"): + VideoReader(src=2) + with pytest.raises(TypeError, match="unexpected keyword argument"): + VideoReader(path="path") + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/tracing/frcnn/CMakeLists.txt b/test/tracing/frcnn/CMakeLists.txt deleted file mode 100644 index c79382470bd..00000000000 --- a/test/tracing/frcnn/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) -project(test_frcnn_tracing) - -find_package(Torch REQUIRED) -find_package(TorchVision REQUIRED) - -# This due to some headers importing Python.h -find_package(Python3 COMPONENTS Development) - -add_executable(test_frcnn_tracing test_frcnn_tracing.cpp) -target_compile_features(test_frcnn_tracing PUBLIC cxx_range_for) -target_link_libraries(test_frcnn_tracing ${TORCH_LIBRARIES} TorchVision::TorchVision Python3::Python) -set_property(TARGET test_frcnn_tracing PROPERTY CXX_STANDARD 14) diff --git a/test/tracing/frcnn/test_frcnn_tracing.cpp b/test/tracing/frcnn/test_frcnn_tracing.cpp deleted file mode 100644 index f5f350b6b02..00000000000 --- a/test/tracing/frcnn/test_frcnn_tracing.cpp +++ 
/dev/null @@ -1,58 +0,0 @@ -#include -#include -#include -#include - - -int main() { - torch::DeviceType device_type; - device_type = torch::kCPU; - - torch::jit::script::Module module; - try { - std::cout << "Loading model\n"; - // Deserialize the ScriptModule from a file using torch::jit::load(). - module = torch::jit::load("fasterrcnn_resnet50_fpn.pt"); - std::cout << "Model loaded\n"; - } catch (const torch::Error& e) { - std::cout << "error loading the model\n"; - return -1; - } catch (const std::exception& e) { - std::cout << "Other error: " << e.what() << "\n"; - return -1; - } - - // TorchScript models require a List[IValue] as input - std::vector inputs; - - // Faster RCNN accepts a List[Tensor] as main input - std::vector images; - images.push_back(torch::rand({3, 256, 275})); - images.push_back(torch::rand({3, 256, 275})); - - inputs.push_back(images); - auto output = module.forward(inputs); - - std::cout << "ok\n"; - std::cout << "output" << output << "\n"; - - if (torch::cuda::is_available()) { - // Move traced model to GPU - module.to(torch::kCUDA); - - // Add GPU inputs - images.clear(); - inputs.clear(); - - torch::TensorOptions options = torch::TensorOptions{torch::kCUDA}; - images.push_back(torch::rand({3, 256, 275}, options)); - images.push_back(torch::rand({3, 256, 275}, options)); - - inputs.push_back(images); - auto output = module.forward(inputs); - - std::cout << "ok\n"; - std::cout << "output" << output << "\n"; - } - return 0; -} diff --git a/test/tracing/frcnn/trace_model.py b/test/tracing/frcnn/trace_model.py deleted file mode 100644 index b5ec50bdab1..00000000000 --- a/test/tracing/frcnn/trace_model.py +++ /dev/null @@ -1,13 +0,0 @@ -import os.path as osp - -import torch -import torchvision - -HERE = osp.dirname(osp.abspath(__file__)) -ASSETS = osp.dirname(osp.dirname(HERE)) - -model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None, weights_backbone=None) -model.eval() - -traced_model = torch.jit.script(model) -traced_model.save("fasterrcnn_resnet50_fpn.pt") diff --git a/torchvision/__init__.py b/torchvision/__init__.py index 739f79407b3..5d06156c25f 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -1,16 +1,20 @@ import os import warnings +from modulefinder import Module import torch -from torchvision import datasets, io, models, ops, transforms, utils -from .extension import _HAS_OPS +# Don't re-order these, we need to load the _C extension (done when importing +# .extensions) before entering _meta_registrations. +from .extension import _HAS_OPS # usort:skip +from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils # usort:skip try: from .version import __version__ # noqa: F401 except ImportError: pass + # Check if torchvision is being imported within the root folder if not _HAS_OPS and os.path.dirname(os.path.realpath(__file__)) == os.path.join( os.path.realpath(os.getcwd()), "torchvision" @@ -66,11 +70,16 @@ def set_video_backend(backend): backend, please compile torchvision from source. """ global _video_backend - if backend not in ["pyav", "video_reader"]: - raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend) - if backend == "video_reader" and not io._HAS_VIDEO_OPT: + if backend not in ["pyav", "video_reader", "cuda"]: + raise ValueError("Invalid video backend '%s'. 
Options are 'pyav', 'video_reader' and 'cuda'" % backend) + if backend == "video_reader" and not io._HAS_CPU_VIDEO_DECODER: + # TODO: better messages message = "video_reader video backend is not available. Please compile torchvision from source and try again" - warnings.warn(message) + raise RuntimeError(message) + elif backend == "cuda" and not io._HAS_GPU_VIDEO_DECODER: + # TODO: better messages + message = "cuda video backend is not available." + raise RuntimeError(message) else: _video_backend = backend @@ -88,3 +97,9 @@ def get_video_backend(): def _is_tracing(): return torch._C._get_tracing_state() + + +def disable_beta_transforms_warning(): + # Noop, only exists to avoid breaking existing code. + # See https://github.com/pytorch/vision/issues/7896 + pass diff --git a/torchvision/_internally_replaced_utils.py b/torchvision/_internally_replaced_utils.py index 18afc3ed93a..e0fa72489f1 100644 --- a/torchvision/_internally_replaced_utils.py +++ b/torchvision/_internally_replaced_utils.py @@ -6,6 +6,7 @@ _HOME = os.path.join(_get_torch_home(), "datasets", "vision") _USE_SHARDED_DATASETS = False +IN_FBCODE = False def _download_file_from_remote_location(fpath: str, url: str) -> None: @@ -28,7 +29,6 @@ def _get_extension_path(lib_name): if os.name == "nt": # Register the main torchvision library location on the default DLL path import ctypes - import sys kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) with_load_library_flags = hasattr(kernel32, "AddDllDirectory") @@ -37,14 +37,7 @@ def _get_extension_path(lib_name): if with_load_library_flags: kernel32.AddDllDirectory.restype = ctypes.c_void_p - if sys.version_info >= (3, 8): - os.add_dll_directory(lib_dir) - elif with_load_library_flags: - res = kernel32.AddDllDirectory(lib_dir) - if res is None: - err = ctypes.WinError(ctypes.get_last_error()) - err.strerror += f' Error adding "{lib_dir}" to the DLL directories.' 
- raise err + os.add_dll_directory(lib_dir) kernel32.SetErrorMode(prev_error_mode) diff --git a/torchvision/_meta_registrations.py b/torchvision/_meta_registrations.py new file mode 100644 index 00000000000..f75bfb77a7f --- /dev/null +++ b/torchvision/_meta_registrations.py @@ -0,0 +1,225 @@ +import functools + +import torch +import torch._custom_ops +import torch.library + +# Ensure that torch.ops.torchvision is visible +import torchvision.extension # noqa: F401 + + +@functools.lru_cache(None) +def get_meta_lib(): + return torch.library.Library("torchvision", "IMPL", "Meta") + + +def register_meta(op_name, overload_name="default"): + def wrapper(fn): + if torchvision.extension._has_ops(): + get_meta_lib().impl(getattr(getattr(torch.ops.torchvision, op_name), overload_name), fn) + return fn + + return wrapper + + +@register_meta("roi_align") +def meta_roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + num_rois = rois.size(0) + channels = input.size(1) + return input.new_empty((num_rois, channels, pooled_height, pooled_width)) + + +@register_meta("_roi_align_backward") +def meta_roi_align_backward( + grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio, aligned +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) + + +@register_meta("ps_roi_align") +def meta_ps_roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + channels = input.size(1) + torch._check( + channels % (pooled_height * pooled_width) == 0, + "input channels must be a multiple of pooling height * pooling width", + ) + + num_rois = rois.size(0) + out_size = (num_rois, channels // (pooled_height * pooled_width), pooled_height, pooled_width) + return input.new_empty(out_size), torch.empty(out_size, dtype=torch.int32, device="meta") + + +@register_meta("_ps_roi_align_backward") +def meta_ps_roi_align_backward( + grad, + rois, + channel_mapping, + spatial_scale, + pooled_height, + pooled_width, + sampling_ratio, + batch_size, + channels, + height, + width, +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) + + +@register_meta("roi_pool") +def meta_roi_pool(input, rois, spatial_scale, pooled_height, pooled_width): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + num_rois = rois.size(0) + channels = input.size(1) + out_size = 
(num_rois, channels, pooled_height, pooled_width) + return input.new_empty(out_size), torch.empty(out_size, device="meta", dtype=torch.int32) + + +@register_meta("_roi_pool_backward") +def meta_roi_pool_backward( + grad, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) + + +@register_meta("ps_roi_pool") +def meta_ps_roi_pool(input, rois, spatial_scale, pooled_height, pooled_width): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + channels = input.size(1) + torch._check( + channels % (pooled_height * pooled_width) == 0, + "input channels must be a multiple of pooling height * pooling width", + ) + num_rois = rois.size(0) + out_size = (num_rois, channels // (pooled_height * pooled_width), pooled_height, pooled_width) + return input.new_empty(out_size), torch.empty(out_size, device="meta", dtype=torch.int32) + + +@register_meta("_ps_roi_pool_backward") +def meta_ps_roi_pool_backward( + grad, rois, channel_mapping, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) + + +@torch.library.register_fake("torchvision::nms") +def meta_nms(dets, scores, iou_threshold): + torch._check(dets.dim() == 2, lambda: f"boxes should be a 2d tensor, got {dets.dim()}D") + torch._check(dets.size(1) == 4, lambda: f"boxes should have 4 elements in dimension 1, got {dets.size(1)}") + torch._check(scores.dim() == 1, lambda: f"scores should be a 1d tensor, got {scores.dim()}") + torch._check( + dets.size(0) == scores.size(0), + lambda: f"boxes and scores should have same number of elements in dimension 0, got {dets.size(0)} and {scores.size(0)}", + ) + ctx = torch._custom_ops.get_ctx() + num_to_keep = ctx.create_unbacked_symint() + return dets.new_empty(num_to_keep, dtype=torch.long) + + +@register_meta("deform_conv2d") +def meta_deform_conv2d( + input, + weight, + offset, + mask, + bias, + stride_h, + stride_w, + pad_h, + pad_w, + dil_h, + dil_w, + n_weight_grps, + n_offset_grps, + use_mask, +): + + out_height, out_width = offset.shape[-2:] + out_channels = weight.shape[0] + batch_size = input.shape[0] + return input.new_empty((batch_size, out_channels, out_height, out_width)) + + +@register_meta("_deform_conv2d_backward") +def meta_deform_conv2d_backward( + grad, + input, + weight, + offset, + mask, + bias, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + groups, + offset_groups, + use_mask, +): + + grad_input = input.new_empty(input.shape) + grad_weight = weight.new_empty(weight.shape) + grad_offset = offset.new_empty(offset.shape) + grad_mask = mask.new_empty(mask.shape) + grad_bias = bias.new_empty(bias.shape) + return grad_input, grad_weight, grad_offset, grad_mask, grad_bias diff --git a/torchvision/_utils.py b/torchvision/_utils.py index b739ef0966e..aee2676df45 100644 --- 
a/torchvision/_utils.py +++ b/torchvision/_utils.py @@ -1,5 +1,6 @@ import enum -from typing import Sequence, Type, TypeVar +from collections.abc import Sequence +from typing import TypeVar T = TypeVar("T", bound=enum.Enum) @@ -7,7 +8,7 @@ class StrEnumMeta(enum.EnumMeta): auto = enum.auto - def from_str(self: Type[T], member: str) -> T: # type: ignore[misc] + def from_str(self: type[T], member: str) -> T: # type: ignore[misc] try: return self[member] except KeyError: diff --git a/torchvision/csrc/io/decoder/audio_sampler.cpp b/torchvision/csrc/io/decoder/audio_sampler.cpp index e26d788d9c7..648955c5845 100644 --- a/torchvision/csrc/io/decoder/audio_sampler.cpp +++ b/torchvision/csrc/io/decoder/audio_sampler.cpp @@ -48,6 +48,23 @@ bool AudioSampler::init(const SamplerParameters& params) { return false; } +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) + SwrContext* swrContext_ = NULL; + AVChannelLayout channel_out; + AVChannelLayout channel_in; + av_channel_layout_default(&channel_out, params.out.audio.channels); + av_channel_layout_default(&channel_in, params.in.audio.channels); + swr_alloc_set_opts2( + &swrContext_, + &channel_out, + (AVSampleFormat)params.out.audio.format, + params.out.audio.samples, + &channel_in, + (AVSampleFormat)params.in.audio.format, + params.in.audio.samples, + 0, + logCtx_); +#else swrContext_ = swr_alloc_set_opts( nullptr, av_get_default_channel_layout(params.out.audio.channels), @@ -58,6 +75,7 @@ bool AudioSampler::init(const SamplerParameters& params) { params.in.audio.samples, 0, logCtx_); +#endif if (swrContext_ == nullptr) { LOG(ERROR) << "Cannot allocate SwrContext"; return false; diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp index 0f6c57e5588..c3a003434b8 100644 --- a/torchvision/csrc/io/decoder/audio_stream.cpp +++ b/torchvision/csrc/io/decoder/audio_stream.cpp @@ -1,31 +1,40 @@ #include "audio_stream.h" #include -#include #include "util.h" namespace ffmpeg { namespace { +static int get_nb_channels(const AVFrame* frame, const AVCodecContext* codec) { +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) + return frame ? frame->ch_layout.nb_channels : codec->ch_layout.nb_channels; +#else + return frame ? 
frame->channels : codec->channels; +#endif +} + bool operator==(const AudioFormat& x, const AVFrame& y) { return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(y.channels) && x.format == y.format; + x.channels == static_cast(get_nb_channels(&y, nullptr)) && + x.format == y.format; } bool operator==(const AudioFormat& x, const AVCodecContext& y) { return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(y.channels) && x.format == y.sample_fmt; + x.channels == static_cast(get_nb_channels(nullptr, &y)) && + x.format == y.sample_fmt; } AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { x.samples = y.sample_rate; - x.channels = y.channels; + x.channels = get_nb_channels(&y, nullptr); x.format = y.format; return x; } AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { x.samples = y.sample_rate; - x.channels = y.channels; + x.channels = get_nb_channels(nullptr, &y); x.format = y.sample_fmt; return x; } @@ -54,9 +63,15 @@ int AudioStream::initFormat() { if (format_.format.audio.samples == 0) { format_.format.audio.samples = codecCtx_->sample_rate; } +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) + if (format_.format.audio.channels == 0) { + format_.format.audio.channels = codecCtx_->ch_layout.nb_channels; + } +#else if (format_.format.audio.channels == 0) { format_.format.audio.channels = codecCtx_->channels; } +#endif if (format_.format.audio.format == AV_SAMPLE_FMT_NONE) { format_.format.audio.format = codecCtx_->sample_fmt; } diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp index f13e2c3ffcf..cfe762bbc6e 100644 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ b/torchvision/csrc/io/decoder/decoder.cpp @@ -285,6 +285,8 @@ bool Decoder::init( return false; } + avioCtx_->max_packet_size = params.maxEncodedBufferSize; + inputCtx_->pb = avioCtx_; inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; } @@ -312,6 +314,8 @@ bool Decoder::init( } } + av_dict_set_int(&options, "probesize", params_.probeSize, 0); + interrupted_ = false; // ffmpeg avformat_open_input call can hang if media source doesn't respond @@ -380,7 +384,30 @@ bool Decoder::init( av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); } + for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { + if ( +#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 + inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO +#else // FFMPEG 4.0+ + inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO +#endif + && inputCtx_->streams[i]->duration > 0) { + // There is at least two 1/r_frame_rates from the frame before the last + // one until the video duration, let's prefer to set duration after the + // frame before the last one, but as early as possible + double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den / + (double)inputCtx_->streams[i]->r_frame_rate.num - + 1 / (double)AV_TIME_BASE; + videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration * + inputCtx_->streams[i]->time_base.num / + (double)inputCtx_->streams[i]->time_base.den - + 1000 * correction; + break; + } + } + VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; + VLOG(1) << "Video duration: " << videoDurationMs_; return true; } @@ -418,20 +445,20 @@ bool Decoder::openStreams(std::vector* metadata) { if (it->stream == -2 || // all streams of this type are welcome (!stream && (it->stream == -1 || it->stream == i))) { // new stream VLOG(1) << "Stream type: " << format.type << " found, at index: " << i; - auto stream 
= createStream( + auto stream_2 = createStream( format.type, inputCtx_, i, params_.convertPtsToWallTime, it->format, params_.loggingUuid); - CHECK(stream); - if (stream->openCodec(metadata, params_.numThreads) < 0) { + CHECK(stream_2); + if (stream_2->openCodec(metadata, params_.numThreads) < 0) { LOG(ERROR) << "uuid=" << params_.loggingUuid << " open codec failed, stream_idx=" << i; return false; } - streams_.emplace(i, std::move(stream)); + streams_.emplace(i, std::move(stream_2)); inRange_.set(i, true); } } @@ -588,13 +615,30 @@ int Decoder::getFrame(size_t workingTimeInMs) { result = 0; av_packet_unref(avPacket); + + if (params_.uniformSampling > 1) { + if (doSeek_) { + double duration = + videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration; + double step = + (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); + avformat_seek_file( + inputCtx_, + -1, + static_cast(step * kFramesDecoded_) + 1, + static_cast(step * (kFramesDecoded_ + 1)), + static_cast(step * (kFramesDecoded_ + 1)), + 0); + ++kFramesDecoded_; + doSeek_ = false; + } + } } av_packet_free(&avPacket); - VLOG(2) << "Interrupted loop" - << ", interrupted_ " << interrupted_ << ", inRange_.any() " - << inRange_.any() << ", decodedFrame " << decodedFrame << ", result " - << result; + VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_ + << ", inRange_.any() " << inRange_.any() << ", decodedFrame " + << decodedFrame << ", result " << result; // loop can be terminated, either by: // 1. explicitly interrupted @@ -658,13 +702,35 @@ int Decoder::processPacket( startCondition = msg.header.pts >= params_.startOffset; } if (endInRange && startCondition) { - *hasMsg = true; - push(std::move(msg)); + *hasMsg = pushMsg(std::move(msg)); } } return result; } +bool Decoder::pushMsg(DecoderOutputMessage&& msg) { + pastDecodedPTS_ = currentDecodedPTS_; + currentDecodedPTS_ = msg.header.pts; + + if (params_.uniformSampling <= 1) { + push(std::move(msg)); + return true; + } + + double duration = + videoDurationMs_ > 0 ? 
videoDurationMs_ : params_.expectedDuration; + double step = + (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); + if (pastDecodedPTS_ < step * kFramesDecoded_ && + step * kFramesDecoded_ <= currentDecodedPTS_) { + push(std::move(msg)); + doSeek_ = true; + return true; + } + + return false; +} + void Decoder::flushStreams() { VLOG(1) << "Flushing streams..."; for (auto& stream : streams_) { @@ -676,7 +742,7 @@ void Decoder::flushStreams() { params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; inRange_.set(stream.second->getIndex(), endInRange); if (endInRange && msg.header.pts >= params_.startOffset) { - push(std::move(msg)); + pushMsg(std::move(msg)); } else { msg.payload.reset(); } diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h index 44d6676aa6b..172a011f93e 100644 --- a/torchvision/csrc/io/decoder/decoder.h +++ b/torchvision/csrc/io/decoder/decoder.h @@ -56,6 +56,7 @@ class Decoder : public MediaDecoder { int* getPrintPrefix() { return &printPrefix; } + double videoDurationMs_ = -1; private: // mark below function for a proper invocation @@ -76,6 +77,8 @@ class Decoder : public MediaDecoder { bool fastSeek = false); void flushStreams(); void cleanUp(); + bool pushMsg(DecoderOutputMessage&& + msg); // returns whether frame is passed to downstream protected: DecoderParameters params_; @@ -89,5 +92,9 @@ class Decoder : public MediaDecoder { AVIOContext* avioCtx_{nullptr}; std::unordered_map> streams_; std::bitset<64> inRange_; + int kFramesDecoded_{0}; + int64_t pastDecodedPTS_{-1}; + int64_t currentDecodedPTS_{-1}; + bool doSeek_{false}; }; } // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h index dac6293d366..d2dc5c7935b 100644 --- a/torchvision/csrc/io/decoder/defs.h +++ b/torchvision/csrc/io/decoder/defs.h @@ -165,7 +165,7 @@ struct MediaFormat { struct DecoderParameters { // local file, remote file, http url, rtmp stream uri, etc. anything that // ffmpeg can recognize - std::string uri; + std::string uri{std::string()}; // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC @@ -213,6 +213,23 @@ struct DecoderParameters { // Skip packets that fail with EPERM errors and continue decoding. bool skipOperationNotPermittedPackets{false}; + + // probing size in bytes, i.e. the size of the data to analyze to get stream + // information. A higher value will enable detecting more information in case + // it is dispersed into the stream, but will increase latency. Must be an + // integer not lesser than 32. It is 5000000 by default. + int64_t probeSize{5000000}; + + // Expected duration of the video to be decoded, mainly used with uniform + // sampling + float expectedDuration{0.0f}; + + // Sample N key-frames from the video roughly uniformly across the timeline + int uniformSampling{0}; + + // with 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames. + // Override this with bigger buffer size if needed. + int64_t maxEncodedBufferSize{0}; }; struct DecoderHeader { @@ -295,7 +312,7 @@ struct DecoderMetadata { }; /** * Abstract class for decoding media bytes - * It has two diffrent modes. Internal media bytes retrieval for given uri and + * It has two different modes. 
Internal media bytes retrieval for given uri and * external media bytes provider in case of memory streams */ class MediaDecoder { diff --git a/torchvision/csrc/io/decoder/gpu/README.rst b/torchvision/csrc/io/decoder/gpu/README.rst index cebd31cb557..e4573d7fe75 100644 --- a/torchvision/csrc/io/decoder/gpu/README.rst +++ b/torchvision/csrc/io/decoder/gpu/README.rst @@ -18,4 +18,4 @@ GPU decoder depends on ffmpeg for demuxing, uses NVDECODE APIs from the nvidia-v .. code:: bash - python setup.py install + pip install . -v --no-build-isolation diff --git a/torchvision/csrc/io/decoder/gpu/decoder.cpp b/torchvision/csrc/io/decoder/gpu/decoder.cpp index 0e451298825..f7377ede38b 100644 --- a/torchvision/csrc/io/decoder/gpu/decoder.cpp +++ b/torchvision/csrc/io/decoder/gpu/decoder.cpp @@ -59,7 +59,7 @@ void Decoder::release() { if (decoder) { cuvidDestroyDecoder(decoder); } - cuCtxPopCurrent(NULL); + cuCtxPopCurrent(nullptr); } /* Trigger video decoding. @@ -100,7 +100,7 @@ int Decoder::handle_picture_decode(CUVIDPICPARAMS* pic_params) { check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); check_for_cuda_errors( cuvidDecodePicture(decoder, pic_params), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); return 1; } @@ -143,7 +143,8 @@ int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) { uint8_t* frame_ptr = decoded_frame.data_ptr(); const uint8_t* const source_arr[] = { (const uint8_t* const)source_frame, - (const uint8_t* const)(source_frame + source_pitch * ((surface_height + 1) & ~1))}; + (const uint8_t* const)(source_frame + + source_pitch * ((surface_height + 1) & ~1))}; auto err = nppiNV12ToRGB_709CSC_8u_P2C3R( source_arr, @@ -159,7 +160,7 @@ int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) { check_for_cuda_errors(cuStreamSynchronize(cuvidStream), __LINE__, __FILE__); decoded_frames.push(decoded_frame); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); check_for_cuda_errors( cuvidUnmapVideoFrame(decoder, source_frame), __LINE__, __FILE__); @@ -177,7 +178,7 @@ void Decoder::query_hardware(CUVIDEOFORMAT* video_format) { check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); check_for_cuda_errors(cuvidGetDecoderCaps(&decode_caps), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); if (!decode_caps.bIsSupported) { TORCH_CHECK(false, "Codec not supported on this GPU"); @@ -319,7 +320,7 @@ int Decoder::handle_video_sequence(CUVIDEOFORMAT* video_format) { cuvidCreateDecoder(&decoder, &video_decode_create_info), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); return decode_surface; } @@ -389,7 +390,7 @@ int Decoder::reconfigure_decoder(CUVIDEOFORMAT* video_format) { check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); check_for_cuda_errors( cuvidReconfigureDecoder(decoder, &reconfig_params), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); return decode_surface; } diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp index 
a7b0128e3ed..4e420c3b3cd 100644 --- a/torchvision/csrc/io/decoder/memory_buffer.cpp +++ b/torchvision/csrc/io/decoder/memory_buffer.cpp @@ -61,7 +61,7 @@ DecoderInCallback MemoryBuffer::getCallback( } // seek mode if (!timeoutMs) { - // seek capabilty, yes - supported + // seek capability, yes - supported return 0; } return object.seek(size, whence); diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp index 0d625ef211c..7969741e72c 100644 --- a/torchvision/csrc/io/decoder/stream.cpp +++ b/torchvision/csrc/io/decoder/stream.cpp @@ -1,6 +1,5 @@ #include "stream.h" #include -#include #include #include "util.h" @@ -63,15 +62,8 @@ int Stream::openCodec(std::vector* metadata, int num_threads) { codecCtx_->thread_count = num_threads; } else { // otherwise set sensible defaults - // with the special case for the different MPEG4 codecs - // that don't have threading context functions - if (codecCtx_->codec->capabilities & AV_CODEC_CAP_INTRA_ONLY) { - codecCtx_->thread_type = FF_THREAD_FRAME; - codecCtx_->thread_count = 2; - } else { - codecCtx_->thread_count = 8; - codecCtx_->thread_type = FF_THREAD_SLICE; - } + codecCtx_->thread_count = 8; + codecCtx_->thread_type = FF_THREAD_SLICE; } int ret; diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp index 27c61d4dbd9..3416f702d7e 100644 --- a/torchvision/csrc/io/decoder/subtitle_stream.cpp +++ b/torchvision/csrc/io/decoder/subtitle_stream.cpp @@ -1,6 +1,5 @@ #include "subtitle_stream.h" #include -#include #include "util.h" namespace ffmpeg { diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp index 936d1e94f46..085966ce687 100644 --- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp +++ b/torchvision/csrc/io/decoder/sync_decoder_test.cpp @@ -158,8 +158,7 @@ void runDecoder(SyncDecoder& decoder) { AVSubtitle sub; memset(&sub, 0, sizeof(sub)); EXPECT_TRUE(Util::deserialize(*out.payload, &sub)); - LOG(INFO) << "Found subtitles" - << ", num rects: " << sub.num_rects; + LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects; for (int i = 0; i < sub.num_rects; ++i) { std::string text = "picture"; if (sub.rects[i]->type == SUBTITLE_TEXT) { @@ -210,9 +209,9 @@ TEST(SyncDecoder, TestSyncDecoderPerformance) { auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8); auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8); auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4); - LOG(INFO) << "Clip decoding (us)" - << ", new(4x2): " << new4x2 << ", new(8x8): " << new8x8 - << ", new(16x8): " << new16x8 << ", new(32x4): " << new32x4; + LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2 + << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8 + << ", new(32x4): " << new32x4; } TEST(SyncDecoder, Test) { @@ -368,7 +367,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) { } // seek mode if (!timeoutMs) { - // seek capabilty, yes - no + // seek capability, yes - no return -1; } return object.seek(size, whence); @@ -408,7 +407,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) { } // seek mode if (!timeoutMs) { - // seek capabilty, yes - no + // seek capability, yes - no return -1; } return object.seek(size, whence); diff --git a/torchvision/csrc/io/decoder/util.cpp b/torchvision/csrc/io/decoder/util.cpp index 2ecd7512c06..7198d2174ed 100644 --- a/torchvision/csrc/io/decoder/util.cpp +++ b/torchvision/csrc/io/decoder/util.cpp @@ -265,7 +265,6 @@ 
std::string generateErrorDesc(int errorCode) { size_t serialize(const AVSubtitle& sub, ByteStorage* out) { const auto len = size(sub); - TORCH_CHECK_LE(len, out->tail()); size_t pos = 0; if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) { return 0; diff --git a/torchvision/csrc/io/decoder/util_test.cpp b/torchvision/csrc/io/decoder/util_test.cpp index 78de08b7139..0a093d9561b 100644 --- a/torchvision/csrc/io/decoder/util_test.cpp +++ b/torchvision/csrc/io/decoder/util_test.cpp @@ -1,5 +1,4 @@ #include -#include #include #include "util.h" diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp index 62ec0709be1..8b712609e34 100644 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ b/torchvision/csrc/io/decoder/video_sampler.cpp @@ -181,6 +181,23 @@ bool VideoSampler::init(const SamplerParameters& params) { // set output format params_ = params; + if (params.in.video.format == AV_PIX_FMT_YUV420P) { + /* When the video width and height are not multiples of 8, + * and there is no size change in the conversion, + * a blurry screen will appear on the right side + * This problem was discovered in 2012 and + * continues to exist in version 4.1.3 in 2019 + * This problem can be avoided by increasing SWS_ACCURATE_RND + * details https://trac.ffmpeg.org/ticket/1582 + */ + if ((params.in.video.width & 0x7) || (params.in.video.height & 0x7)) { + VLOG(1) << "The width " << params.in.video.width << " and height " + << params.in.video.height << " the image is not a multiple of 8, " + << "the decoding speed may be reduced"; + swsFlags_ |= SWS_ACCURATE_RND; + } + } + scaleContext_ = sws_getContext( params.in.video.width, params.in.video.height, diff --git a/torchvision/csrc/io/image/common.cpp b/torchvision/csrc/io/image/common.cpp new file mode 100644 index 00000000000..7743961a09d --- /dev/null +++ b/torchvision/csrc/io/image/common.cpp @@ -0,0 +1,50 @@ + +#include "common.h" + +// If we are in a Windows environment, we need to define +// initialization functions for the _custom_ops extension +#ifdef _WIN32 +void* PyInit_image(void) { + return nullptr; +} +#endif + +namespace vision { +namespace image { + +void validate_encoded_data(const torch::Tensor& encoded_data) { + TORCH_CHECK(encoded_data.is_contiguous(), "Input tensor must be contiguous."); + TORCH_CHECK( + encoded_data.dtype() == torch::kU8, + "Input tensor must have uint8 data type, got ", + encoded_data.dtype()); + TORCH_CHECK( + encoded_data.dim() == 1 && encoded_data.numel() > 0, + "Input tensor must be 1-dimensional and non-empty, got ", + encoded_data.dim(), + " dims and ", + encoded_data.numel(), + " numels."); +} + +bool should_this_return_rgb_or_rgba_let_me_know_in_the_comments_down_below_guys_see_you_in_the_next_video( + ImageReadMode mode, + bool has_alpha) { + // Return true if the calling decoding function should return a 3D RGB tensor, + // and false if it should return a 4D RGBA tensor. + // This function ignores the requested "grayscale" modes and treats it as + // "unchanged", so it should only used on decoders who don't support grayscale + // outputs. + + if (mode == IMAGE_READ_MODE_RGB) { + return true; + } + if (mode == IMAGE_READ_MODE_RGB_ALPHA) { + return false; + } + // From here we assume mode is "unchanged", even for grayscale ones. 
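+  //
+  // To make the contract concrete, the effective mapping is (a summary of the
+  // branches in this helper, not additional behavior):
+  //
+  //   requested mode              has_alpha   returns
+  //   IMAGE_READ_MODE_RGB         any         true  (RGB)
+  //   IMAGE_READ_MODE_RGB_ALPHA   any         false (RGBA)
+  //   any other mode              false       true  (RGB)
+  //   any other mode              true        false (RGBA)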
+ return !has_alpha; +} + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/image_read_mode.h b/torchvision/csrc/io/image/common.h similarity index 65% rename from torchvision/csrc/io/image/image_read_mode.h rename to torchvision/csrc/io/image/common.h index 84425265c34..d81acfda7d4 100644 --- a/torchvision/csrc/io/image/image_read_mode.h +++ b/torchvision/csrc/io/image/common.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace vision { namespace image { @@ -13,5 +14,11 @@ const ImageReadMode IMAGE_READ_MODE_GRAY_ALPHA = 2; const ImageReadMode IMAGE_READ_MODE_RGB = 3; const ImageReadMode IMAGE_READ_MODE_RGB_ALPHA = 4; +void validate_encoded_data(const torch::Tensor& encoded_data); + +bool should_this_return_rgb_or_rgba_let_me_know_in_the_comments_down_below_guys_see_you_in_the_next_video( + ImageReadMode mode, + bool has_alpha); + } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_gif.cpp b/torchvision/csrc/io/image/cpu/decode_gif.cpp new file mode 100644 index 00000000000..f26d37950e3 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_gif.cpp @@ -0,0 +1,165 @@ +#include "decode_gif.h" +#include +#include "../common.h" +#include "giflib/gif_lib.h" + +namespace vision { +namespace image { + +typedef struct reader_helper_t { + uint8_t const* encoded_data; // input tensor data pointer + size_t encoded_data_size; // size of input tensor in bytes + size_t num_bytes_read; // number of bytes read so far in the tensor +} reader_helper_t; + +// That function is used by GIFLIB routines to read the encoded bytes. +// This reads `len` bytes and writes them into `buf`. The data is read from the +// input tensor passed to decode_gif() starting at the `num_bytes_read` +// position. +int read_from_tensor(GifFileType* gifFile, GifByteType* buf, int len) { + // the UserData field was set in DGifOpen() + reader_helper_t* reader_helper = + static_cast(gifFile->UserData); + + size_t num_bytes_to_read = std::min( + (size_t)len, + reader_helper->encoded_data_size - reader_helper->num_bytes_read); + std::memcpy( + buf, reader_helper->encoded_data + reader_helper->num_bytes_read, len); + reader_helper->num_bytes_read += num_bytes_to_read; + return num_bytes_to_read; +} + +torch::Tensor decode_gif(const torch::Tensor& encoded_data) { + // LibGif docs: https://giflib.sourceforge.net/intro.html + // Refer over there for more details on the libgif API, API ref, and a + // detailed description of the GIF format. + + validate_encoded_data(encoded_data); + + int error = D_GIF_SUCCEEDED; + + // We're using DGidOpen. The other entrypoints of libgif are + // DGifOpenFileName and DGifOpenFileHandle but we don't want to use those, + // since we need to read the encoded bytes from a tensor of encoded bytes, not + // from a file (for consistency with existing jpeg and png decoders). Using + // DGifOpen is the only way to read from a custom source. + // For that we need to provide a reader function `read_from_tensor` that + // reads from the tensor, and we have to keep track of the number of bytes + // read so far: this is why we need the reader_helper struct. + + // TODO: We are potentially doing an unnecessary copy of the encoded bytes: + // - 1 copy in from file to tensor (in read_file()) + // - 1 copy from tensor to GIFLIB buffers (in read_from_tensor()) + // Since we're vendoring GIFLIB we can potentially modify the calls to + // InternalRead() and just set the `buf` pointer to the tensor data directly. 
+ // That might even save allocation of those buffers. + // If we do that, we'd have to make sure the buffers are never written to by + // GIFLIB, otherwise we'd be overridding the tensor data. + reader_helper_t reader_helper; + reader_helper.encoded_data = encoded_data.data_ptr(); + reader_helper.encoded_data_size = encoded_data.numel(); + reader_helper.num_bytes_read = 0; + GifFileType* gifFile = + DGifOpen(static_cast(&reader_helper), read_from_tensor, &error); + + TORCH_CHECK( + (gifFile != nullptr) && (error == D_GIF_SUCCEEDED), + "DGifOpenFileName() failed - ", + error); + + if (DGifSlurp(gifFile) == GIF_ERROR) { + auto gifFileError = gifFile->Error; + DGifCloseFile(gifFile, &error); + TORCH_CHECK(false, "DGifSlurp() failed - ", gifFileError); + } + auto num_images = gifFile->ImageCount; + + // This check should already done within DGifSlurp(), just to be safe + TORCH_CHECK(num_images > 0, "GIF file should contain at least one image!"); + + GifColorType bg = {0, 0, 0}; + if (gifFile->SColorMap) { + bg = gifFile->SColorMap->Colors[gifFile->SBackGroundColor]; + } + + // The GIFLIB docs say that the canvas's height and width are potentially + // ignored by modern viewers, so to be on the safe side we set the output + // height to max(canvas_heigh, first_image_height). Same for width. + // https://giflib.sourceforge.net/whatsinagif/bits_and_bytes.html + auto out_h = + std::max(gifFile->SHeight, gifFile->SavedImages[0].ImageDesc.Height); + auto out_w = + std::max(gifFile->SWidth, gifFile->SavedImages[0].ImageDesc.Width); + + // We output a channels-last tensor for consistency with other image decoders. + // Torchvision's resize tends to be is faster on uint8 channels-last tensors. + auto options = torch::TensorOptions() + .dtype(torch::kU8) + .memory_format(torch::MemoryFormat::ChannelsLast); + auto out = torch::empty( + {int64_t(num_images), 3, int64_t(out_h), int64_t(out_w)}, options); + auto out_a = out.accessor(); + for (int i = 0; i < num_images; i++) { + const SavedImage& img = gifFile->SavedImages[i]; + + GraphicsControlBlock gcb; + DGifSavedExtensionToGCB(gifFile, i, &gcb); + + const GifImageDesc& desc = img.ImageDesc; + const ColorMapObject* cmap = + desc.ColorMap ? desc.ColorMap : gifFile->SColorMap; + TORCH_CHECK( + cmap != nullptr, + "Global and local color maps are missing. This should never happen!"); + + // When going from one image to another, there is a "disposal method" which + // specifies how to handle the transition. E.g. DISPOSE_DO_NOT means that + // the current image should essentially be drawn on top of the previous + // canvas. The pixels of that previous canvas will appear on the new one if + // either: + // - a pixel is transparent in the current image + // - the current image is smaller than the canvas, hence exposing its pixels + // The "background" disposal method means that the current canvas should be + // set to the background color. + // We only support these 2 modes and default to "background" when the + // disposal method is unspecified, or when it's set to "DISPOSE_PREVIOUS" + // which according to GIFLIB is not widely supported. + // (https://giflib.sourceforge.net/whatsinagif/animation_and_transparency.html). + if (i > 0 && gcb.DisposalMode == DISPOSE_DO_NOT) { + out[i] = out[i - 1]; + } else { + // Background. 
If bg wasn't defined, it will be (0, 0, 0) + for (int h = 0; h < gifFile->SHeight; h++) { + for (int w = 0; w < gifFile->SWidth; w++) { + out_a[i][0][h][w] = bg.Red; + out_a[i][1][h][w] = bg.Green; + out_a[i][2][h][w] = bg.Blue; + } + } + } + + for (int h = 0; h < desc.Height; h++) { + for (int w = 0; w < desc.Width; w++) { + auto c = img.RasterBits[h * desc.Width + w]; + if (c == gcb.TransparentColor) { + continue; + } + GifColorType rgb = cmap->Colors[c]; + out_a[i][0][h + desc.Top][w + desc.Left] = rgb.Red; + out_a[i][1][h + desc.Top][w + desc.Left] = rgb.Green; + out_a[i][2][h + desc.Top][w + desc.Left] = rgb.Blue; + } + } + } + + out = out.squeeze(0); // remove batch dim if there's only one image + + DGifCloseFile(gifFile, &error); + TORCH_CHECK(error == D_GIF_SUCCEEDED, "DGifCloseFile() failed - ", error); + + return out; +} + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_gif.h b/torchvision/csrc/io/image/cpu/decode_gif.h new file mode 100644 index 00000000000..68d5073c91b --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_gif.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +namespace vision { +namespace image { + +// encoded_data tensor must be 1D uint8 and contiguous +C10_EXPORT torch::Tensor decode_gif(const torch::Tensor& encoded_data); + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp index 1cc05dc76ca..43a688604f6 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.cpp +++ b/torchvision/csrc/io/image/cpu/decode_image.cpp @@ -1,12 +1,19 @@ #include "decode_image.h" +#include "decode_gif.h" #include "decode_jpeg.h" #include "decode_png.h" +#include "decode_webp.h" namespace vision { namespace image { -torch::Tensor decode_image(const torch::Tensor& data, ImageReadMode mode) { +torch::Tensor decode_image( + const torch::Tensor& data, + ImageReadMode mode, + bool apply_exif_orientation) { + // Check that tensor is a CPU tensor + TORCH_CHECK(data.device() == torch::kCPU, "Expected a CPU tensor"); // Check that the input tensor dtype is uint8 TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); // Check that the input tensor is 1-dimensional @@ -14,21 +21,43 @@ torch::Tensor decode_image(const torch::Tensor& data, ImageReadMode mode) { data.dim() == 1 && data.numel() > 0, "Expected a non empty 1-dimensional tensor"); + auto err_msg = + "Unsupported image file. Only jpeg, png, webp and gif are currently supported. For avif and heic format, please rely on `decode_avif` and `decode_heic` directly."; + auto datap = data.data_ptr(); const uint8_t jpeg_signature[3] = {255, 216, 255}; // == "\xFF\xD8\xFF" + TORCH_CHECK(data.numel() >= 3, err_msg); + if (memcmp(jpeg_signature, datap, 3) == 0) { + return decode_jpeg(data, mode, apply_exif_orientation); + } + const uint8_t png_signature[4] = {137, 80, 78, 71}; // == "\211PNG" + TORCH_CHECK(data.numel() >= 4, err_msg); + if (memcmp(png_signature, datap, 4) == 0) { + return decode_png(data, mode, apply_exif_orientation); + } - if (memcmp(jpeg_signature, datap, 3) == 0) { - return decode_jpeg(data, mode); - } else if (memcmp(png_signature, datap, 4) == 0) { - return decode_png(data, mode); - } else { - TORCH_CHECK( - false, - "Unsupported image file. 
Only jpeg and png ", - "are currently supported."); + const uint8_t gif_signature_1[6] = { + 0x47, 0x49, 0x46, 0x38, 0x39, 0x61}; // == "GIF89a" + const uint8_t gif_signature_2[6] = { + 0x47, 0x49, 0x46, 0x38, 0x37, 0x61}; // == "GIF87a" + TORCH_CHECK(data.numel() >= 6, err_msg); + if (memcmp(gif_signature_1, datap, 6) == 0 || + memcmp(gif_signature_2, datap, 6) == 0) { + return decode_gif(data); + } + + const uint8_t webp_signature_begin[4] = {0x52, 0x49, 0x46, 0x46}; // == "RIFF" + const uint8_t webp_signature_end[7] = { + 0x57, 0x45, 0x42, 0x50, 0x56, 0x50, 0x38}; // == "WEBPVP8" + TORCH_CHECK(data.numel() >= 15, err_msg); + if ((memcmp(webp_signature_begin, datap, 4) == 0) && + (memcmp(webp_signature_end, datap + 8, 7) == 0)) { + return decode_webp(data, mode); } + + TORCH_CHECK(false, err_msg); } } // namespace image diff --git a/torchvision/csrc/io/image/cpu/decode_image.h b/torchvision/csrc/io/image/cpu/decode_image.h index 853d6d91afa..f66d47eccd4 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.h +++ b/torchvision/csrc/io/image/cpu/decode_image.h @@ -1,14 +1,15 @@ #pragma once #include -#include "../image_read_mode.h" +#include "../common.h" namespace vision { namespace image { C10_EXPORT torch::Tensor decode_image( const torch::Tensor& data, - ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED, + bool apply_exif_orientation = false); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp index 6ec644d003e..052b98e1be9 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp @@ -1,17 +1,23 @@ #include "decode_jpeg.h" +#include "../common.h" #include "common_jpeg.h" +#include "exif.h" namespace vision { namespace image { #if !JPEG_FOUND -torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { +torch::Tensor decode_jpeg( + const torch::Tensor& data, + ImageReadMode mode, + bool apply_exif_orientation) { TORCH_CHECK( false, "decode_jpeg: torchvision not compiled with libjpeg support"); } #else using namespace detail; +using namespace exif_private; namespace { @@ -65,19 +71,72 @@ static void torch_jpeg_set_source_mgr( src->len = len; src->pub.bytes_in_buffer = len; src->pub.next_input_byte = src->data; + + jpeg_save_markers(cinfo, APP1, 0xffff); +} + +inline unsigned char clamped_cmyk_rgb_convert( + unsigned char k, + unsigned char cmy) { + // Inspired from Pillow: + // https://github.com/python-pillow/Pillow/blob/07623d1a7cc65206a5355fba2ae256550bfcaba6/src/libImaging/Convert.c#L568-L569 + int v = k * cmy + 128; + v = ((v >> 8) + v) >> 8; + return std::clamp(k - v, 0, 255); +} + +void convert_line_cmyk_to_rgb( + j_decompress_ptr cinfo, + const unsigned char* cmyk_line, + unsigned char* rgb_line) { + int width = cinfo->output_width; + for (int i = 0; i < width; ++i) { + int c = cmyk_line[i * 4 + 0]; + int m = cmyk_line[i * 4 + 1]; + int y = cmyk_line[i * 4 + 2]; + int k = cmyk_line[i * 4 + 3]; + + rgb_line[i * 3 + 0] = clamped_cmyk_rgb_convert(k, 255 - c); + rgb_line[i * 3 + 1] = clamped_cmyk_rgb_convert(k, 255 - m); + rgb_line[i * 3 + 2] = clamped_cmyk_rgb_convert(k, 255 - y); + } +} + +inline unsigned char rgb_to_gray(int r, int g, int b) { + // Inspired from Pillow: + // https://github.com/python-pillow/Pillow/blob/07623d1a7cc65206a5355fba2ae256550bfcaba6/src/libImaging/Convert.c#L226 + return (r * 19595 + g * 38470 + b * 7471 + 0x8000) >> 16; +} + +void 
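+// Worked example of the fixed-point arithmetic above (a restatement of
+// clamped_cmyk_rgb_convert and rgb_to_gray, not new behavior):
+// - clamped_cmyk_rgb_convert(k, cmy) computes roughly k - k * cmy / 255,
+//   i.e. k * (255 - cmy) / 255; the ((v >> 8) + v) >> 8 step is an integer
+//   approximation of dividing by 255, with the +128 providing rounding.
+//   E.g. k = 200, cmy = 128: v = 25728, which reduces to 100, so the helper
+//   returns 200 - 100 = 100.
+// - rgb_to_gray uses 16-bit fixed-point luma weights 19595/65536 ~= 0.299,
+//   38470/65536 ~= 0.587 and 7471/65536 ~= 0.114 (the usual BT.601 weights).
+// convert_line_cmyk_to_gray below simply chains the two steps per pixel.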
convert_line_cmyk_to_gray( + j_decompress_ptr cinfo, + const unsigned char* cmyk_line, + unsigned char* gray_line) { + int width = cinfo->output_width; + for (int i = 0; i < width; ++i) { + int c = cmyk_line[i * 4 + 0]; + int m = cmyk_line[i * 4 + 1]; + int y = cmyk_line[i * 4 + 2]; + int k = cmyk_line[i * 4 + 3]; + + int r = clamped_cmyk_rgb_convert(k, 255 - c); + int g = clamped_cmyk_rgb_convert(k, 255 - m); + int b = clamped_cmyk_rgb_convert(k, 255 - y); + + gray_line[i] = rgb_to_gray(r, g, b); + } } } // namespace -torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { +torch::Tensor decode_jpeg( + const torch::Tensor& data, + ImageReadMode mode, + bool apply_exif_orientation) { C10_LOG_API_USAGE_ONCE( "torchvision.csrc.io.image.cpu.decode_jpeg.decode_jpeg"); - // Check that the input tensor dtype is uint8 - TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); - // Check that the input tensor is 1-dimensional - TORCH_CHECK( - data.dim() == 1 && data.numel() > 0, - "Expected a non empty 1-dimensional tensor"); + + validate_encoded_data(data); struct jpeg_decompress_struct cinfo; struct torch_jpeg_error_mgr jerr; @@ -102,20 +161,29 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { jpeg_read_header(&cinfo, TRUE); int channels = cinfo.num_components; + bool cmyk_to_rgb_or_gray = false; if (mode != IMAGE_READ_MODE_UNCHANGED) { switch (mode) { case IMAGE_READ_MODE_GRAY: - if (cinfo.jpeg_color_space != JCS_GRAYSCALE) { + if (cinfo.jpeg_color_space == JCS_CMYK || + cinfo.jpeg_color_space == JCS_YCCK) { + cinfo.out_color_space = JCS_CMYK; + cmyk_to_rgb_or_gray = true; + } else { cinfo.out_color_space = JCS_GRAYSCALE; - channels = 1; } + channels = 1; break; case IMAGE_READ_MODE_RGB: - if (cinfo.jpeg_color_space != JCS_RGB) { + if (cinfo.jpeg_color_space == JCS_CMYK || + cinfo.jpeg_color_space == JCS_YCCK) { + cinfo.out_color_space = JCS_CMYK; + cmyk_to_rgb_or_gray = true; + } else { cinfo.out_color_space = JCS_RGB; - channels = 3; } + channels = 3; break; /* * Libjpeg does not support converting from CMYK to grayscale etc. There @@ -130,6 +198,11 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { jpeg_calc_output_dimensions(&cinfo); } + int exif_orientation = -1; + if (apply_exif_orientation) { + exif_orientation = fetch_jpeg_exif_orientation(&cinfo); + } + jpeg_start_decompress(&cinfo); int height = cinfo.output_height; @@ -139,21 +212,57 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { auto tensor = torch::empty({int64_t(height), int64_t(width), channels}, torch::kU8); auto ptr = tensor.data_ptr(); + torch::Tensor cmyk_line_tensor; + if (cmyk_to_rgb_or_gray) { + cmyk_line_tensor = torch::empty({int64_t(width), 4}, torch::kU8); + } + while (cinfo.output_scanline < cinfo.output_height) { /* jpeg_read_scanlines expects an array of pointers to scanlines. * Here the array is only one element long, but you could ask for * more than one scanline at a time if that's more convenient. 
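   * When cmyk_to_rgb_or_gray is set, each scanline is instead read into
   * cmyk_line_tensor as 4-byte CMYK pixels and then converted into the RGB or
   * gray output row by the helpers above; either way `ptr` advances by
   * `stride` per iteration.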
*/ - jpeg_read_scanlines(&cinfo, &ptr, 1); + if (cmyk_to_rgb_or_gray) { + auto cmyk_line_ptr = cmyk_line_tensor.data_ptr(); + jpeg_read_scanlines(&cinfo, &cmyk_line_ptr, 1); + + if (channels == 3) { + convert_line_cmyk_to_rgb(&cinfo, cmyk_line_ptr, ptr); + } else if (channels == 1) { + convert_line_cmyk_to_gray(&cinfo, cmyk_line_ptr, ptr); + } + } else { + jpeg_read_scanlines(&cinfo, &ptr, 1); + } ptr += stride; } jpeg_finish_decompress(&cinfo); jpeg_destroy_decompress(&cinfo); - return tensor.permute({2, 0, 1}); + auto output = tensor.permute({2, 0, 1}); + + if (apply_exif_orientation) { + return exif_orientation_transform(output, exif_orientation); + } + return output; +} +#endif // #if !JPEG_FOUND + +int64_t _jpeg_version() { +#if JPEG_FOUND + return JPEG_LIB_VERSION; +#else + return -1; +#endif } +bool _is_compiled_against_turbo() { +#ifdef LIBJPEG_TURBO_VERSION + return true; +#else + return false; #endif +} } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.h b/torchvision/csrc/io/image/cpu/decode_jpeg.h index 97ed3d51a54..7412a46d2ea 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.h +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.h @@ -1,14 +1,18 @@ #pragma once #include -#include "../image_read_mode.h" +#include "../common.h" namespace vision { namespace image { C10_EXPORT torch::Tensor decode_jpeg( const torch::Tensor& data, - ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED, + bool apply_exif_orientation = false); + +C10_EXPORT int64_t _jpeg_version(); +C10_EXPORT bool _is_compiled_against_turbo(); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_png.cpp b/torchvision/csrc/io/image/cpu/decode_png.cpp index b1ceaf1badd..5ea6f073975 100644 --- a/torchvision/csrc/io/image/cpu/decode_png.cpp +++ b/torchvision/csrc/io/image/cpu/decode_png.cpp @@ -1,14 +1,18 @@ #include "decode_png.h" +#include "../common.h" #include "common_png.h" +#include "exif.h" namespace vision { namespace image { +using namespace exif_private; + #if !PNG_FOUND torch::Tensor decode_png( const torch::Tensor& data, ImageReadMode mode, - bool allow_16_bits) { + bool apply_exif_orientation) { TORCH_CHECK( false, "decode_png: torchvision not compiled with libPNG support"); } @@ -22,14 +26,10 @@ bool is_little_endian() { torch::Tensor decode_png( const torch::Tensor& data, ImageReadMode mode, - bool allow_16_bits) { + bool apply_exif_orientation) { C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.image.cpu.decode_png.decode_png"); - // Check that the input tensor dtype is uint8 - TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); - // Check that the input tensor is 1-dimensional - TORCH_CHECK( - data.dim() == 1 && data.numel() > 0, - "Expected a non empty 1-dimensional tensor"); + + validate_encoded_data(data); auto png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr); @@ -49,6 +49,7 @@ torch::Tensor decode_png( png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); TORCH_CHECK(false, "Internal error."); } + TORCH_CHECK(datap_len >= 8, "Content is too small for png!") auto is_png = !png_sig_cmp(datap, 0, 8); TORCH_CHECK(is_png, "Content is not png!") @@ -93,18 +94,19 @@ torch::Tensor decode_png( TORCH_CHECK(retval == 1, "Could read image metadata from content.") } - auto max_bit_depth = allow_16_bits ? 
16 : 8; - auto err_msg = "At most " + std::to_string(max_bit_depth) + - "-bit PNG images are supported currently."; - if (bit_depth > max_bit_depth) { + if (bit_depth > 8 && bit_depth != 16) { png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); - TORCH_CHECK(false, err_msg) + TORCH_CHECK( + false, + "bit depth of png image is " + std::to_string(bit_depth) + + ". Only <=8 and 16 are supported.") } int channels = png_get_channels(png_ptr, info_ptr); - if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) + if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) { png_set_expand_gray_1_2_4_to_8(png_ptr); + } int number_of_passes; if (interlace_type == PNG_INTERLACE_ADAM7) { @@ -193,48 +195,34 @@ torch::Tensor decode_png( } auto num_pixels_per_row = width * channels; + auto is_16_bits = bit_depth == 16; auto tensor = torch::empty( {int64_t(height), int64_t(width), channels}, - bit_depth <= 8 ? torch::kU8 : torch::kI32); - - if (bit_depth <= 8) { - auto t_ptr = tensor.accessor().data(); - for (int pass = 0; pass < number_of_passes; pass++) { - for (png_uint_32 i = 0; i < height; ++i) { - png_read_row(png_ptr, t_ptr, nullptr); - t_ptr += num_pixels_per_row; - } - t_ptr = tensor.accessor().data(); - } - } else { - // We're reading a 16bits png, but pytorch doesn't support uint16. - // So we read each row in a 16bits tmp_buffer which we then cast into - // a int32 tensor instead. - if (is_little_endian()) { - png_set_swap(png_ptr); - } - int32_t* t_ptr = tensor.accessor().data(); - - // We create a tensor instead of malloc-ing for automatic memory management - auto tmp_buffer_tensor = torch::empty( - {int64_t(num_pixels_per_row * sizeof(uint16_t))}, torch::kU8); - uint16_t* tmp_buffer = - (uint16_t*)tmp_buffer_tensor.accessor().data(); - - for (int pass = 0; pass < number_of_passes; pass++) { - for (png_uint_32 i = 0; i < height; ++i) { - png_read_row(png_ptr, (uint8_t*)tmp_buffer, nullptr); - // Now we copy the uint16 values into the int32 tensor. - for (size_t j = 0; j < num_pixels_per_row; ++j) { - t_ptr[j] = (int32_t)tmp_buffer[j]; - } - t_ptr += num_pixels_per_row; - } - t_ptr = tensor.accessor().data(); + is_16_bits ? at::kUInt16 : torch::kU8); + if (is_little_endian()) { + png_set_swap(png_ptr); + } + auto t_ptr = (uint8_t*)tensor.data_ptr(); + for (int pass = 0; pass < number_of_passes; pass++) { + for (png_uint_32 i = 0; i < height; ++i) { + png_read_row(png_ptr, t_ptr, nullptr); + t_ptr += num_pixels_per_row * (is_16_bits ? 
2 : 1); } + t_ptr = (uint8_t*)tensor.data_ptr(); + } + + int exif_orientation = -1; + if (apply_exif_orientation) { + exif_orientation = fetch_png_exif_orientation(png_ptr, info_ptr); } + png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); - return tensor.permute({2, 0, 1}); + + auto output = tensor.permute({2, 0, 1}); + if (apply_exif_orientation) { + return exif_orientation_transform(output, exif_orientation); + } + return output; } #endif diff --git a/torchvision/csrc/io/image/cpu/decode_png.h b/torchvision/csrc/io/image/cpu/decode_png.h index fed89327cdb..faaffa7ae49 100644 --- a/torchvision/csrc/io/image/cpu/decode_png.h +++ b/torchvision/csrc/io/image/cpu/decode_png.h @@ -1,7 +1,7 @@ #pragma once #include -#include "../image_read_mode.h" +#include "../common.h" namespace vision { namespace image { @@ -9,7 +9,7 @@ namespace image { C10_EXPORT torch::Tensor decode_png( const torch::Tensor& data, ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED, - bool allow_16_bits = false); + bool apply_exif_orientation = false); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_webp.cpp b/torchvision/csrc/io/image/cpu/decode_webp.cpp new file mode 100644 index 00000000000..80fe68862fb --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_webp.cpp @@ -0,0 +1,66 @@ +#include "decode_webp.h" +#include "../common.h" + +#if WEBP_FOUND +#include "webp/decode.h" +#include "webp/types.h" +#endif // WEBP_FOUND + +namespace vision { +namespace image { + +#if !WEBP_FOUND +torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode) { + TORCH_CHECK( + false, "decode_webp: torchvision not compiled with libwebp support"); +} +#else + +torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode) { + validate_encoded_data(encoded_data); + + auto encoded_data_p = encoded_data.data_ptr(); + auto encoded_data_size = encoded_data.numel(); + + WebPBitstreamFeatures features; + auto res = WebPGetFeatures(encoded_data_p, encoded_data_size, &features); + TORCH_CHECK( + res == VP8_STATUS_OK, "WebPGetFeatures failed with error code ", res); + TORCH_CHECK( + !features.has_animation, "Animated webp files are not supported."); + + if (mode == IMAGE_READ_MODE_GRAY || mode == IMAGE_READ_MODE_GRAY_ALPHA) { + TORCH_WARN_ONCE( + "Webp does not support grayscale conversions. " + "The returned tensor will be in the colorspace of the original image."); + } + + auto return_rgb = + should_this_return_rgb_or_rgba_let_me_know_in_the_comments_down_below_guys_see_you_in_the_next_video( + mode, features.has_alpha); + + auto decoding_func = return_rgb ? WebPDecodeRGB : WebPDecodeRGBA; + auto num_channels = return_rgb ? 
3 : 4; + + int width = 0; + int height = 0; + + auto decoded_data = + decoding_func(encoded_data_p, encoded_data_size, &width, &height); + + TORCH_CHECK(decoded_data != nullptr, "WebPDecodeRGB[A] failed."); + + auto deleter = [decoded_data](void*) { WebPFree(decoded_data); }; + auto out = torch::from_blob( + decoded_data, {height, width, num_channels}, deleter, torch::kUInt8); + + return out.permute({2, 0, 1}); +} +#endif // WEBP_FOUND + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_webp.h b/torchvision/csrc/io/image/cpu/decode_webp.h new file mode 100644 index 00000000000..d5c81547c42 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_webp.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "../common.h" + +namespace vision { +namespace image { + +C10_EXPORT torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/encode_png.cpp b/torchvision/csrc/io/image/cpu/encode_png.cpp index a9b7d76ff61..d55a0ed3ff6 100644 --- a/torchvision/csrc/io/image/cpu/encode_png.cpp +++ b/torchvision/csrc/io/image/cpu/encode_png.cpp @@ -47,13 +47,15 @@ void torch_png_write_data( size_t nsize = p->size + length; /* allocate or grow buffer */ - if (p->buffer) + if (p->buffer) { p->buffer = (char*)realloc(p->buffer, nsize); - else + } else { p->buffer = (char*)malloc(nsize); + } - if (!p->buffer) + if (!p->buffer) { png_error(png_ptr, "Write Error"); + } /* copy new bytes to end of buffer */ memcpy(p->buffer + p->size, data, length); @@ -71,7 +73,7 @@ torch::Tensor encode_png(const torch::Tensor& data, int64_t compression_level) { // Define output buffer struct torch_mem_encode buf_info; - buf_info.buffer = NULL; + buf_info.buffer = nullptr; buf_info.size = 0; /* Establish the setjmp return context for my_error_exit to use. */ @@ -79,15 +81,15 @@ torch::Tensor encode_png(const torch::Tensor& data, int64_t compression_level) { /* If we get here, the PNG code has signaled an error. * We need to clean up the PNG object and the buffer. */ - if (info_ptr != NULL) { + if (info_ptr != nullptr) { png_destroy_info_struct(png_write, &info_ptr); } - if (png_write != NULL) { - png_destroy_write_struct(&png_write, NULL); + if (png_write != nullptr) { + png_destroy_write_struct(&png_write, nullptr); } - if (buf_info.buffer != NULL) { + if (buf_info.buffer != nullptr) { free(buf_info.buffer); } @@ -121,12 +123,12 @@ torch::Tensor encode_png(const torch::Tensor& data, int64_t compression_level) { // Initialize PNG structures png_write = png_create_write_struct( - PNG_LIBPNG_VER_STRING, &err_ptr, torch_png_error, NULL); + PNG_LIBPNG_VER_STRING, &err_ptr, torch_png_error, nullptr); info_ptr = png_create_info_struct(png_write); // Define custom buffer output - png_set_write_fn(png_write, &buf_info, torch_png_write_data, NULL); + png_set_write_fn(png_write, &buf_info, torch_png_write_data, nullptr); // Set output image information auto color_type = channels == 1 ? PNG_COLOR_TYPE_GRAY : PNG_COLOR_TYPE_RGB; diff --git a/torchvision/csrc/io/image/cpu/exif.h b/torchvision/csrc/io/image/cpu/exif.h new file mode 100644 index 00000000000..7680737f8c0 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/exif.h @@ -0,0 +1,257 @@ +// @nolint (improperly imported third-party code) +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// +// By downloading, copying, installing or using the software you agree to this +license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without +modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright +notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote +products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" +and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are +disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any +direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ +#pragma once +// Functions in this module are taken from OpenCV +// https://github.com/opencv/opencv/blob/097891e311fae1d8354eb092a0fd0171e630d78c/modules/imgcodecs/src/exif.cpp + +#if JPEG_FOUND +#include +#endif +#if PNG_FOUND +#include +#endif + +#include + +namespace vision { +namespace image { +namespace exif_private { + +constexpr uint16_t APP1 = 0xe1; +constexpr uint16_t ENDIANNESS_INTEL = 0x49; +constexpr uint16_t ENDIANNESS_MOTO = 0x4d; +constexpr uint16_t REQ_EXIF_TAG_MARK = 0x2a; +constexpr uint16_t ORIENTATION_EXIF_TAG = 0x0112; +constexpr uint16_t INCORRECT_TAG = -1; + +class ExifDataReader { + public: + ExifDataReader(unsigned char* p, size_t s) : _ptr(p), _size(s) {} + size_t size() const { + return _size; + } + const unsigned char& operator[](size_t index) const { + TORCH_CHECK(index >= 0 && index < _size); + return _ptr[index]; + } + + protected: + unsigned char* _ptr; + size_t _size; +}; + +inline uint16_t get_endianness(const ExifDataReader& exif_data) { + if ((exif_data.size() < 1) || + (exif_data.size() > 1 && exif_data[0] != exif_data[1])) { + return 0; + } + if (exif_data[0] == 'I') { + return ENDIANNESS_INTEL; + } + if (exif_data[0] == 'M') { + return ENDIANNESS_MOTO; + } + return 0; +} + +inline uint16_t get_uint16( + const ExifDataReader& exif_data, + uint16_t endianness, + const size_t offset) { + if (offset + 1 >= exif_data.size()) { + return INCORRECT_TAG; + } + + if (endianness == ENDIANNESS_INTEL) { + return exif_data[offset] + (exif_data[offset + 1] << 8); + } + return (exif_data[offset] << 8) + exif_data[offset + 1]; +} + +inline uint32_t get_uint32( + const ExifDataReader& exif_data, + uint16_t endianness, + const size_t offset) { + if (offset + 3 >= exif_data.size()) { + return INCORRECT_TAG; + } + + if (endianness == ENDIANNESS_INTEL) { + return exif_data[offset] + (exif_data[offset + 1] << 8) + + (exif_data[offset + 2] << 16) + (exif_data[offset + 3] << 24); + } + return (exif_data[offset] << 24) + (exif_data[offset + 1] << 16) + + (exif_data[offset + 2] << 8) + exif_data[offset + 3]; +} + +inline int fetch_exif_orientation(unsigned char* exif_data_ptr, size_t size) { + int exif_orientation = -1; + + // Exif binary structure looks like this + // First 6 bytes: [E, x, i, f, 0, 0] + // Endianness, 2 bytes : [M, M] or [I, I] + // Tag mark, 2 bytes: [0, 0x2a] + // Offset, 4 bytes + // Num entries, 2 bytes + // Tag entries and data, tag has 2 bytes and its data has 10 bytes + // For more details: + // http://www.media.mit.edu/pia/Research/deepview/exif.html + + ExifDataReader exif_data(exif_data_ptr, size); + auto endianness = get_endianness(exif_data); + + // Checking whether Tag Mark (0x002A) correspond to one contained in the + // Jpeg file + uint16_t tag_mark = get_uint16(exif_data, endianness, 2); + if (tag_mark == REQ_EXIF_TAG_MARK) { + auto offset = get_uint32(exif_data, endianness, 4); + size_t num_entry = get_uint16(exif_data, endianness, offset); + offset += 2; // go to start of tag fields + constexpr size_t tiff_field_size = 12; + for (size_t entry = 0; entry < num_entry; entry++) { + // Here we just search for orientation tag and parse it + auto tag_num = get_uint16(exif_data, endianness, offset); + if (tag_num == INCORRECT_TAG) { + break; + } + if (tag_num == ORIENTATION_EXIF_TAG) { + exif_orientation = get_uint16(exif_data, endianness, offset + 8); + break; + } + offset += tiff_field_size; + } + } + return exif_orientation; +} + +#if JPEG_FOUND +inline int fetch_jpeg_exif_orientation(j_decompress_ptr cinfo) { + // Check for Exif 
marker APP1 + jpeg_saved_marker_ptr exif_marker = 0; + jpeg_saved_marker_ptr cmarker = cinfo->marker_list; + while (cmarker && exif_marker == 0) { + if (cmarker->marker == APP1) { + exif_marker = cmarker; + } + cmarker = cmarker->next; + } + + if (!exif_marker) { + return -1; + } + + constexpr size_t start_offset = 6; + if (exif_marker->data_length <= start_offset) { + return -1; + } + + auto* exif_data_ptr = exif_marker->data + start_offset; + auto size = exif_marker->data_length - start_offset; + + return fetch_exif_orientation(exif_data_ptr, size); +} +#endif // #if JPEG_FOUND + +#if PNG_FOUND && defined(PNG_eXIf_SUPPORTED) +inline int fetch_png_exif_orientation(png_structp png_ptr, png_infop info_ptr) { + png_uint_32 num_exif = 0; + png_bytep exif = 0; + + // Exif info could be in info_ptr + if (png_get_valid(png_ptr, info_ptr, PNG_INFO_eXIf)) { + png_get_eXIf_1(png_ptr, info_ptr, &num_exif, &exif); + } + + if (exif && num_exif > 0) { + return fetch_exif_orientation(exif, num_exif); + } + return -1; +} +#endif // #if PNG_FOUND && defined(PNG_eXIf_SUPPORTED) + +constexpr uint16_t IMAGE_ORIENTATION_TL = 1; // normal orientation +constexpr uint16_t IMAGE_ORIENTATION_TR = 2; // needs horizontal flip +constexpr uint16_t IMAGE_ORIENTATION_BR = 3; // needs 180 rotation +constexpr uint16_t IMAGE_ORIENTATION_BL = 4; // needs vertical flip +constexpr uint16_t IMAGE_ORIENTATION_LT = + 5; // mirrored horizontal & rotate 270 CW +constexpr uint16_t IMAGE_ORIENTATION_RT = 6; // rotate 90 CW +constexpr uint16_t IMAGE_ORIENTATION_RB = + 7; // mirrored horizontal & rotate 90 CW +constexpr uint16_t IMAGE_ORIENTATION_LB = 8; // needs 270 CW rotation + +inline torch::Tensor exif_orientation_transform( + const torch::Tensor& image, + int orientation) { + if (orientation == IMAGE_ORIENTATION_TL) { + return image; + } else if (orientation == IMAGE_ORIENTATION_TR) { + return image.flip(-1); + } else if (orientation == IMAGE_ORIENTATION_BR) { + // needs 180 rotation equivalent to + // flip both horizontally and vertically + return image.flip({-2, -1}); + } else if (orientation == IMAGE_ORIENTATION_BL) { + return image.flip(-2); + } else if (orientation == IMAGE_ORIENTATION_LT) { + return image.transpose(-1, -2); + } else if (orientation == IMAGE_ORIENTATION_RT) { + return image.transpose(-1, -2).flip(-1); + } else if (orientation == IMAGE_ORIENTATION_RB) { + return image.transpose(-1, -2).flip({-2, -1}); + } else if (orientation == IMAGE_ORIENTATION_LB) { + return image.transpose(-1, -2).flip(-2); + } + return image; +} + +} // namespace exif_private +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/giflib/README b/torchvision/csrc/io/image/cpu/giflib/README new file mode 100644 index 00000000000..7353453e32e --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/README @@ -0,0 +1,28 @@ +These files come from the GIFLIB project (https://giflib.sourceforge.net/) and +are licensed under the MIT license. + +Some modifications have been made to the original files: +- Remove use of "register" keyword in gifalloc.c for C++17 compatibility. +- Declare loop variable i in DGifGetImageHeader as int instead of unsigned int. 
+ +Below is the original license text from the COPYING file of the GIFLIB project: + += MIT LICENSE + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c b/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c new file mode 100644 index 00000000000..7d35fff87ee --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c @@ -0,0 +1,1313 @@ +// @nolint (improperly imported third-party code) +/****************************************************************************** + +dgif_lib.c - GIF decoding + +The functions here and in egif_lib.c are partitioned carefully so that +if you only require one of read and write capability, only one of these +two modules will be linked. Preserve this property! + +*****************************************************************************/ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright (C) Eric S. Raymond + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#endif /* _WIN32 */ + +#include "gif_lib.h" +#include "gif_lib_private.h" + +/* compose unsigned little endian value */ +#define UNSIGNED_LITTLE_ENDIAN(lo, hi) ((lo) | ((hi) << 8)) + +/* avoid extra function call in case we use fread (TVT) */ +static int InternalRead(GifFileType *gif, GifByteType *buf, int len) { + // fprintf(stderr, "### Read: %d\n", len); + return (((GifFilePrivateType *)gif->Private)->Read + ? ((GifFilePrivateType *)gif->Private)->Read(gif, buf, len) + : fread(buf, 1, len, + ((GifFilePrivateType *)gif->Private)->File)); +} + +static int DGifGetWord(GifFileType *GifFile, GifWord *Word); +static int DGifSetupDecompress(GifFileType *GifFile); +static int DGifDecompressLine(GifFileType *GifFile, GifPixelType *Line, + int LineLen); +static int DGifGetPrefixChar(const GifPrefixType *Prefix, int Code, + int ClearCode); +static int DGifDecompressInput(GifFileType *GifFile, int *Code); +static int DGifBufferedInput(GifFileType *GifFile, GifByteType *Buf, + GifByteType *NextByte); + +/****************************************************************************** + Open a new GIF file for read, given by its name. + Returns dynamically allocated GifFileType pointer which serves as the GIF + info record. 
+******************************************************************************/ +GifFileType *DGifOpenFileName(const char *FileName, int *Error) { + int FileHandle; + GifFileType *GifFile; + + if ((FileHandle = open(FileName, O_RDONLY)) == -1) { + if (Error != NULL) { + *Error = D_GIF_ERR_OPEN_FAILED; + } + return NULL; + } + + GifFile = DGifOpenFileHandle(FileHandle, Error); + return GifFile; +} + +/****************************************************************************** + Update a new GIF file, given its file handle. + Returns dynamically allocated GifFileType pointer which serves as the GIF + info record. +******************************************************************************/ +GifFileType *DGifOpenFileHandle(int FileHandle, int *Error) { + char Buf[GIF_STAMP_LEN + 1]; + GifFileType *GifFile; + GifFilePrivateType *Private; + FILE *f; + + GifFile = (GifFileType *)malloc(sizeof(GifFileType)); + if (GifFile == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + (void)close(FileHandle); + return NULL; + } + + /*@i1@*/ memset(GifFile, '\0', sizeof(GifFileType)); + + /* Belt and suspenders, in case the null pointer isn't zero */ + GifFile->SavedImages = NULL; + GifFile->SColorMap = NULL; + + Private = (GifFilePrivateType *)calloc(1, sizeof(GifFilePrivateType)); + if (Private == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + (void)close(FileHandle); + free((char *)GifFile); + return NULL; + } + + /*@i1@*/ memset(Private, '\0', sizeof(GifFilePrivateType)); + +#ifdef _WIN32 + _setmode(FileHandle, O_BINARY); /* Make sure it is in binary mode. */ +#endif /* _WIN32 */ + + f = fdopen(FileHandle, "rb"); /* Make it into a stream: */ + + /*@-mustfreeonly@*/ + GifFile->Private = (void *)Private; + Private->FileHandle = FileHandle; + Private->File = f; + Private->FileState = FILE_STATE_READ; + Private->Read = NULL; /* don't use alternate input method (TVT) */ + GifFile->UserData = NULL; /* TVT */ + /*@=mustfreeonly@*/ + + /* Let's see if this is a GIF file: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, (unsigned char *)Buf, GIF_STAMP_LEN) != + GIF_STAMP_LEN) { + if (Error != NULL) { + *Error = D_GIF_ERR_READ_FAILED; + } + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + /* Check for GIF prefix at start of file */ + Buf[GIF_STAMP_LEN] = 0; + if (strncmp(GIF_STAMP, Buf, GIF_VERSION_POS) != 0) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_GIF_FILE; + } + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + if (DGifGetScreenDesc(GifFile) == GIF_ERROR) { + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + GifFile->Error = 0; + + /* What version of GIF? 
*/ + Private->gif89 = (Buf[GIF_VERSION_POS + 1] == '9'); + + return GifFile; +} + +/****************************************************************************** + GifFileType constructor with user supplied input function (TVT) +******************************************************************************/ +GifFileType *DGifOpen(void *userData, InputFunc readFunc, int *Error) { + char Buf[GIF_STAMP_LEN + 1]; + GifFileType *GifFile; + GifFilePrivateType *Private; + + GifFile = (GifFileType *)malloc(sizeof(GifFileType)); + if (GifFile == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + return NULL; + } + + memset(GifFile, '\0', sizeof(GifFileType)); + + /* Belt and suspenders, in case the null pointer isn't zero */ + GifFile->SavedImages = NULL; + GifFile->SColorMap = NULL; + + Private = (GifFilePrivateType *)calloc(1, sizeof(GifFilePrivateType)); + if (!Private) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + free((char *)GifFile); + return NULL; + } + /*@i1@*/ memset(Private, '\0', sizeof(GifFilePrivateType)); + + GifFile->Private = (void *)Private; + Private->FileHandle = 0; + Private->File = NULL; + Private->FileState = FILE_STATE_READ; + + Private->Read = readFunc; /* TVT */ + GifFile->UserData = userData; /* TVT */ + + /* Lets see if this is a GIF file: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, (unsigned char *)Buf, GIF_STAMP_LEN) != + GIF_STAMP_LEN) { + if (Error != NULL) { + *Error = D_GIF_ERR_READ_FAILED; + } + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + /* Check for GIF prefix at start of file */ + Buf[GIF_STAMP_LEN] = '\0'; + if (strncmp(GIF_STAMP, Buf, GIF_VERSION_POS) != 0) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_GIF_FILE; + } + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + if (DGifGetScreenDesc(GifFile) == GIF_ERROR) { + free((char *)Private); + free((char *)GifFile); + if (Error != NULL) { + *Error = D_GIF_ERR_NO_SCRN_DSCR; + } + return NULL; + } + + GifFile->Error = 0; + + /* What version of GIF? */ + Private->gif89 = (Buf[GIF_VERSION_POS + 1] == '9'); + + return GifFile; +} + +/****************************************************************************** + This routine should be called before any other DGif calls. Note that + this routine is called automatically from DGif file open routines. +******************************************************************************/ +int DGifGetScreenDesc(GifFileType *GifFile) { + int BitsPerPixel; + bool SortFlag; + GifByteType Buf[3]; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* Put the screen descriptor into the file: */ + if (DGifGetWord(GifFile, &GifFile->SWidth) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->SHeight) == GIF_ERROR) { + return GIF_ERROR; + } + + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + return GIF_ERROR; + } + GifFile->SColorResolution = (((Buf[0] & 0x70) + 1) >> 4) + 1; + SortFlag = (Buf[0] & 0x08) != 0; + BitsPerPixel = (Buf[0] & 0x07) + 1; + GifFile->SBackGroundColor = Buf[1]; + GifFile->AspectByte = Buf[2]; + if (Buf[0] & 0x80) { /* Do we have global color map? 
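The DGifOpen/InputFunc path above is how a GIF can be decoded straight from a memory buffer. A sketch of such a read callback; the MemSource type is hypothetical, only DGifOpen, InputFunc and UserData come from this code:

#include <string.h>
#include "gif_lib.h"

/* Hypothetical in-memory source; the struct name and fields are illustrative. */
typedef struct {
    const GifByteType *data;
    size_t len;
    size_t pos;
} MemSource;

static int mem_read(GifFileType *gif, GifByteType *buf, int want) {
    MemSource *src = (MemSource *)gif->UserData;
    size_t left = src->len - src->pos;
    size_t n = (size_t)want < left ? (size_t)want : left;
    memcpy(buf, src->data + src->pos, n);
    src->pos += n;
    return (int)n; /* a short count makes giflib report D_GIF_ERR_READ_FAILED */
}

/* Usage: MemSource src = {bytes, nbytes, 0};
 *        GifFileType *gif = DGifOpen(&src, mem_read, &err); */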
*/ + int i; + + GifFile->SColorMap = GifMakeMapObject(1 << BitsPerPixel, NULL); + if (GifFile->SColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + + /* Get the global color map: */ + GifFile->SColorMap->SortFlag = SortFlag; + for (i = 0; i < GifFile->SColorMap->ColorCount; i++) { + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + GifFile->SColorMap->Colors[i].Red = Buf[0]; + GifFile->SColorMap->Colors[i].Green = Buf[1]; + GifFile->SColorMap->Colors[i].Blue = Buf[2]; + } + } else { + GifFile->SColorMap = NULL; + } + + /* + * No check here for whether the background color is in range for the + * screen color map. Possibly there should be. + */ + + return GIF_OK; +} + +const char *DGifGetGifVersion(GifFileType *GifFile) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (Private->gif89) { + return GIF89_STAMP; + } else { + return GIF87_STAMP; + } +} + +/****************************************************************************** + This routine should be called before any attempt to read an image. +******************************************************************************/ +int DGifGetRecordType(GifFileType *GifFile, GifRecordType *Type) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + // fprintf(stderr, "### DGifGetRecordType: %02x\n", Buf); + switch (Buf) { + case DESCRIPTOR_INTRODUCER: + *Type = IMAGE_DESC_RECORD_TYPE; + break; + case EXTENSION_INTRODUCER: + *Type = EXTENSION_RECORD_TYPE; + break; + case TERMINATOR_INTRODUCER: + *Type = TERMINATE_RECORD_TYPE; + break; + default: + *Type = UNDEFINED_RECORD_TYPE; + GifFile->Error = D_GIF_ERR_WRONG_RECORD; + return GIF_ERROR; + } + + return GIF_OK; +} + +int DGifGetImageHeader(GifFileType *GifFile) { + unsigned int BitsPerPixel; + GifByteType Buf[3]; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifGetWord(GifFile, &GifFile->Image.Left) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Top) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Width) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Height) == GIF_ERROR) { + return GIF_ERROR; + } + if (InternalRead(GifFile, Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + return GIF_ERROR; + } + BitsPerPixel = (Buf[0] & 0x07) + 1; + GifFile->Image.Interlace = (Buf[0] & 0x40) ? true : false; + + /* Setup the colormap */ + if (GifFile->Image.ColorMap) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + } + /* Does this image have local color map? 
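DGifGetRecordType above is meant to drive a loop over the file's records until the trailer; a skeleton of that loop, illustrative only and with error handling trimmed:

#include "gif_lib.h"

static int walk_records(GifFileType *gif) {
    GifRecordType rec;
    do {
        if (DGifGetRecordType(gif, &rec) == GIF_ERROR) {
            return GIF_ERROR; /* gif->Error holds a D_GIF_ERR_* code */
        }
        switch (rec) {
        case IMAGE_DESC_RECORD_TYPE:
            /* DGifGetImageDesc(), then DGifGetLine() once per row */
            break;
        case EXTENSION_RECORD_TYPE:
            /* DGifGetExtension(), then DGifGetExtensionNext() until NULL */
            break;
        default:
            break;
        }
    } while (rec != TERMINATE_RECORD_TYPE);
    return GIF_OK;
}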
*/ + if (Buf[0] & 0x80) { + int i; + + GifFile->Image.ColorMap = + GifMakeMapObject(1 << BitsPerPixel, NULL); + if (GifFile->Image.ColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + + /* Get the image local color map: */ + for (i = 0; i < GifFile->Image.ColorMap->ColorCount; i++) { + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFile->Image.ColorMap = NULL; + return GIF_ERROR; + } + GifFile->Image.ColorMap->Colors[i].Red = Buf[0]; + GifFile->Image.ColorMap->Colors[i].Green = Buf[1]; + GifFile->Image.ColorMap->Colors[i].Blue = Buf[2]; + } + } + + Private->PixelCount = + (long)GifFile->Image.Width * (long)GifFile->Image.Height; + + /* Reset decompress algorithm parameters. */ + return DGifSetupDecompress(GifFile); +} + +/****************************************************************************** + This routine should be called before any attempt to read an image. + Note it is assumed the Image desc. header has been read. +******************************************************************************/ +int DGifGetImageDesc(GifFileType *GifFile) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + SavedImage *sp; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifGetImageHeader(GifFile) == GIF_ERROR) { + return GIF_ERROR; + } + + if (GifFile->SavedImages) { + SavedImage *new_saved_images = (SavedImage *)reallocarray( + GifFile->SavedImages, (GifFile->ImageCount + 1), + sizeof(SavedImage)); + if (new_saved_images == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + GifFile->SavedImages = new_saved_images; + } else { + if ((GifFile->SavedImages = + (SavedImage *)malloc(sizeof(SavedImage))) == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + } + + sp = &GifFile->SavedImages[GifFile->ImageCount]; + memcpy(&sp->ImageDesc, &GifFile->Image, sizeof(GifImageDesc)); + if (GifFile->Image.ColorMap != NULL) { + sp->ImageDesc.ColorMap = + GifMakeMapObject(GifFile->Image.ColorMap->ColorCount, + GifFile->Image.ColorMap->Colors); + if (sp->ImageDesc.ColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + } + sp->RasterBits = (unsigned char *)NULL; + sp->ExtensionBlockCount = 0; + sp->ExtensionBlocks = (ExtensionBlock *)NULL; + + GifFile->ImageCount++; + + return GIF_OK; +} + +/****************************************************************************** + Get one full scanned line (Line) of length LineLen from GIF file. 
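Combining DGifGetImageDesc with the line reader documented here, a non-interlaced frame can be pulled row by row into a caller-owned buffer. An illustrative sketch that assumes an IMAGE_DESC record has just been reported and ignores interlacing:

#include <stdlib.h>
#include "gif_lib.h"

static GifPixelType *read_frame_rows(GifFileType *gif) {
    if (DGifGetImageDesc(gif) == GIF_ERROR) {
        return NULL;
    }
    int w = gif->Image.Width;
    int h = gif->Image.Height;
    GifPixelType *px = (GifPixelType *)malloc((size_t)w * (size_t)h);
    if (px == NULL) {
        return NULL;
    }
    for (int y = 0; y < h; y++) {
        if (DGifGetLine(gif, px + (size_t)y * (size_t)w, w) == GIF_ERROR) {
            free(px);
            return NULL;
        }
    }
    return px; /* palette indices, one byte per pixel; caller frees */
}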
+******************************************************************************/ +int DGifGetLine(GifFileType *GifFile, GifPixelType *Line, int LineLen) { + GifByteType *Dummy; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (!LineLen) { + LineLen = GifFile->Image.Width; + } + + if ((Private->PixelCount -= LineLen) > 0xffff0000UL) { + GifFile->Error = D_GIF_ERR_DATA_TOO_BIG; + return GIF_ERROR; + } + + if (DGifDecompressLine(GifFile, Line, LineLen) == GIF_OK) { + if (Private->PixelCount == 0) { + /* We probably won't be called any more, so let's clean + * up everything before we return: need to flush out all + * the rest of image until an empty block (size 0) + * detected. We use GetCodeNext. + */ + do { + if (DGifGetCodeNext(GifFile, &Dummy) == + GIF_ERROR) { + return GIF_ERROR; + } + } while (Dummy != NULL); + } + return GIF_OK; + } else { + return GIF_ERROR; + } +} + +/****************************************************************************** + Put one pixel (Pixel) into GIF file. +******************************************************************************/ +int DGifGetPixel(GifFileType *GifFile, GifPixelType Pixel) { + GifByteType *Dummy; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + if (--Private->PixelCount > 0xffff0000UL) { + GifFile->Error = D_GIF_ERR_DATA_TOO_BIG; + return GIF_ERROR; + } + + if (DGifDecompressLine(GifFile, &Pixel, 1) == GIF_OK) { + if (Private->PixelCount == 0) { + /* We probably won't be called any more, so let's clean + * up everything before we return: need to flush out all + * the rest of image until an empty block (size 0) + * detected. We use GetCodeNext. + */ + do { + if (DGifGetCodeNext(GifFile, &Dummy) == + GIF_ERROR) { + return GIF_ERROR; + } + } while (Dummy != NULL); + } + return GIF_OK; + } else { + return GIF_ERROR; + } +} + +/****************************************************************************** + Get an extension block (see GIF manual) from GIF file. This routine only + returns the first data block, and DGifGetExtensionNext should be called + after this one until NULL extension is returned. + The Extension should NOT be freed by the user (not dynamically allocated). + Note it is assumed the Extension description header has been read. +******************************************************************************/ +int DGifGetExtension(GifFileType *GifFile, int *ExtCode, + GifByteType **Extension) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + // fprintf(stderr, "### -> DGifGetExtension:\n"); + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + *ExtCode = Buf; + // fprintf(stderr, "### <- DGifGetExtension: %02x, about to call + // next\n", Buf); + + return DGifGetExtensionNext(GifFile, Extension); +} + +/****************************************************************************** + Get a following extension block (see GIF manual) from GIF file. This + routine should be called until NULL Extension is returned. 
+ The Extension should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetExtensionNext(GifFileType *GifFile, GifByteType **Extension) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + // fprintf(stderr, "### -> DGifGetExtensionNext\n"); + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + // fprintf(stderr, "### DGifGetExtensionNext sees %d\n", Buf); + + if (Buf > 0) { + *Extension = Private->Buf; /* Use private unused buffer. */ + (*Extension)[0] = + Buf; /* Pascal strings notation (pos. 0 is len.). */ + /* coverity[tainted_data,check_return] */ + if (InternalRead(GifFile, &((*Extension)[1]), Buf) != Buf) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + } else { + *Extension = NULL; + } + // fprintf(stderr, "### <- DGifGetExtensionNext: %p\n", Extension); + + return GIF_OK; +} + +/****************************************************************************** + Extract a Graphics Control Block from raw extension data +******************************************************************************/ + +int DGifExtensionToGCB(const size_t GifExtensionLength, + const GifByteType *GifExtension, + GraphicsControlBlock *GCB) { + if (GifExtensionLength != 4) { + return GIF_ERROR; + } + + GCB->DisposalMode = (GifExtension[0] >> 2) & 0x07; + GCB->UserInputFlag = (GifExtension[0] & 0x02) != 0; + GCB->DelayTime = + UNSIGNED_LITTLE_ENDIAN(GifExtension[1], GifExtension[2]); + if (GifExtension[0] & 0x01) { + GCB->TransparentColor = (int)GifExtension[3]; + } else { + GCB->TransparentColor = NO_TRANSPARENT_COLOR; + } + + return GIF_OK; +} + +/****************************************************************************** + Extract the Graphics Control Block for a saved image, if it exists. +******************************************************************************/ + +int DGifSavedExtensionToGCB(GifFileType *GifFile, int ImageIndex, + GraphicsControlBlock *GCB) { + int i; + + if (ImageIndex < 0 || ImageIndex > GifFile->ImageCount - 1) { + return GIF_ERROR; + } + + GCB->DisposalMode = DISPOSAL_UNSPECIFIED; + GCB->UserInputFlag = false; + GCB->DelayTime = 0; + GCB->TransparentColor = NO_TRANSPARENT_COLOR; + + for (i = 0; i < GifFile->SavedImages[ImageIndex].ExtensionBlockCount; + i++) { + ExtensionBlock *ep = + &GifFile->SavedImages[ImageIndex].ExtensionBlocks[i]; + if (ep->Function == GRAPHICS_EXT_FUNC_CODE) { + return DGifExtensionToGCB(ep->ByteCount, ep->Bytes, + GCB); + } + } + + return GIF_ERROR; +} + +/****************************************************************************** + This routine should be called last, to close the GIF file. 
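Once a file has been slurped, DGifSavedExtensionToGCB above is the usual way to recover per-frame timing and transparency; a small illustrative helper:

#include "gif_lib.h"

/* Returns the delay of frame i in milliseconds; 0 if the frame has no GCB. */
static int frame_delay_ms(GifFileType *gif, int i, int *transparent_index) {
    GraphicsControlBlock gcb;
    if (DGifSavedExtensionToGCB(gif, i, &gcb) == GIF_ERROR) {
        *transparent_index = NO_TRANSPARENT_COLOR;
        return 0;
    }
    *transparent_index = gcb.TransparentColor; /* -1 when opaque */
    return gcb.DelayTime * 10; /* DelayTime is stored in 1/100 s units */
}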
+******************************************************************************/ +int DGifCloseFile(GifFileType *GifFile, int *ErrorCode) { + GifFilePrivateType *Private; + + if (GifFile == NULL || GifFile->Private == NULL) { + return GIF_ERROR; + } + + if (GifFile->Image.ColorMap) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + } + + if (GifFile->SColorMap) { + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + } + + if (GifFile->SavedImages) { + GifFreeSavedImages(GifFile); + GifFile->SavedImages = NULL; + } + + GifFreeExtensions(&GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks); + + Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_ERR_NOT_READABLE; + } + free((char *)GifFile->Private); + free(GifFile); + return GIF_ERROR; + } + + if (Private->File && (fclose(Private->File) != 0)) { + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_ERR_CLOSE_FAILED; + } + free((char *)GifFile->Private); + free(GifFile); + return GIF_ERROR; + } + + free((char *)GifFile->Private); + free(GifFile); + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_SUCCEEDED; + } + return GIF_OK; +} + +/****************************************************************************** + Get 2 bytes (word) from the given file: +******************************************************************************/ +static int DGifGetWord(GifFileType *GifFile, GifWord *Word) { + unsigned char c[2]; + + /* coverity[check_return] */ + if (InternalRead(GifFile, c, 2) != 2) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + *Word = (GifWord)UNSIGNED_LITTLE_ENDIAN(c[0], c[1]); + return GIF_OK; +} + +/****************************************************************************** + Get the image code in compressed form. This routine can be called if the + information needed to be piped out as is. Obviously this is much faster + than decoding and encoding again. This routine should be followed by calls + to DGifGetCodeNext, until NULL block is returned. + The block should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetCode(GifFileType *GifFile, int *CodeSize, GifByteType **CodeBlock) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + *CodeSize = Private->BitsPerPixel; + + return DGifGetCodeNext(GifFile, CodeBlock); +} + +/****************************************************************************** + Continue to get the image code in compressed form. This routine should be + called until NULL block is returned. + The block should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetCodeNext(GifFileType *GifFile, GifByteType **CodeBlock) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + /* coverity[tainted_data_argument] */ + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + /* coverity[lower_bounds] */ + if (Buf > 0) { + *CodeBlock = Private->Buf; /* Use private unused buffer. */ + (*CodeBlock)[0] = + Buf; /* Pascal strings notation (pos. 
0 is len.). */ + /* coverity[tainted_data] */ + if (InternalRead(GifFile, &((*CodeBlock)[1]), Buf) != Buf) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + } else { + *CodeBlock = NULL; + Private->Buf[0] = 0; /* Make sure the buffer is empty! */ + Private->PixelCount = + 0; /* And local info. indicate image read. */ + } + + return GIF_OK; +} + +/****************************************************************************** + Setup the LZ decompression for this image: +******************************************************************************/ +static int DGifSetupDecompress(GifFileType *GifFile) { + int i, BitsPerPixel; + GifByteType CodeSize; + GifPrefixType *Prefix; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + /* coverity[check_return] */ + if (InternalRead(GifFile, &CodeSize, 1) < + 1) { /* Read Code size from file. */ + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; /* Failed to read Code size. */ + } + BitsPerPixel = CodeSize; + + /* this can only happen on a severely malformed GIF */ + if (BitsPerPixel > 8) { + GifFile->Error = + D_GIF_ERR_READ_FAILED; /* somewhat bogus error code */ + return GIF_ERROR; /* Failed to read Code size. */ + } + + Private->Buf[0] = 0; /* Input Buffer empty. */ + Private->BitsPerPixel = BitsPerPixel; + Private->ClearCode = (1 << BitsPerPixel); + Private->EOFCode = Private->ClearCode + 1; + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = BitsPerPixel + 1; /* Number of bits per code. */ + Private->MaxCode1 = 1 << Private->RunningBits; /* Max. code + 1. */ + Private->StackPtr = 0; /* No pixels on the pixel stack. */ + Private->LastCode = NO_SUCH_CODE; + Private->CrntShiftState = 0; /* No information in CrntShiftDWord. */ + Private->CrntShiftDWord = 0; + + Prefix = Private->Prefix; + for (i = 0; i <= LZ_MAX_CODE; i++) { + Prefix[i] = NO_SUCH_CODE; + } + + return GIF_OK; +} + +/****************************************************************************** + The LZ decompression routine: + This version decompress the given GIF file into Line of length LineLen. + This routine can be called few times (one per scan line, for example), in + order the complete the whole image. +******************************************************************************/ +static int DGifDecompressLine(GifFileType *GifFile, GifPixelType *Line, + int LineLen) { + int i = 0; + int j, CrntCode, EOFCode, ClearCode, CrntPrefix, LastCode, StackPtr; + GifByteType *Stack, *Suffix; + GifPrefixType *Prefix; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + StackPtr = Private->StackPtr; + Prefix = Private->Prefix; + Suffix = Private->Suffix; + Stack = Private->Stack; + EOFCode = Private->EOFCode; + ClearCode = Private->ClearCode; + LastCode = Private->LastCode; + + if (StackPtr > LZ_MAX_CODE) { + return GIF_ERROR; + } + + if (StackPtr != 0) { + /* Let pop the stack off before continueing to read the GIF + * file: */ + while (StackPtr != 0 && i < LineLen) { + Line[i++] = Stack[--StackPtr]; + } + } + + while (i < LineLen) { /* Decode LineLen items. */ + if (DGifDecompressInput(GifFile, &CrntCode) == GIF_ERROR) { + return GIF_ERROR; + } + + if (CrntCode == EOFCode) { + /* Note however that usually we will not be here as we + * will stop decoding as soon as we got all the pixel, + * or EOF code will not be read at all, and + * DGifGetLine/Pixel clean everything. 
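As a concrete instance of the setup in DGifSetupDecompress above, a code-size byte of 2 (a 4-color image) yields ClearCode 4, EOFCode 5, a first dynamic code of 6, 3-bit codes to start, and MaxCode1 of 8; the code width then grows toward LZ_BITS (12) as entries are added. A tiny check of that arithmetic:

#include <assert.h>

static void lzw_setup_example(void) {
    int BitsPerPixel = 2;                /* code size byte from the stream */
    int ClearCode = 1 << BitsPerPixel;   /* 4 */
    int EOFCode = ClearCode + 1;         /* 5 */
    int RunningCode = EOFCode + 1;       /* 6: first free table entry */
    int RunningBits = BitsPerPixel + 1;  /* codes start at 3 bits */
    int MaxCode1 = 1 << RunningBits;     /* 8: widen codes when reached */
    assert(ClearCode == 4 && EOFCode == 5 && RunningCode == 6);
    assert(RunningBits == 3 && MaxCode1 == 8);
}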
*/ + GifFile->Error = D_GIF_ERR_EOF_TOO_SOON; + return GIF_ERROR; + } else if (CrntCode == ClearCode) { + /* We need to start over again: */ + for (j = 0; j <= LZ_MAX_CODE; j++) { + Prefix[j] = NO_SUCH_CODE; + } + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = Private->BitsPerPixel + 1; + Private->MaxCode1 = 1 << Private->RunningBits; + LastCode = Private->LastCode = NO_SUCH_CODE; + } else { + /* Its regular code - if in pixel range simply add it to + * output stream, otherwise trace to codes linked list + * until the prefix is in pixel range: */ + if (CrntCode < ClearCode) { + /* This is simple - its pixel scalar, so add it + * to output: */ + Line[i++] = CrntCode; + } else { + /* Its a code to needed to be traced: trace the + * linked list until the prefix is a pixel, + * while pushing the suffix pixels on our stack. + * If we done, pop the stack in reverse (thats + * what stack is good for!) order to output. */ + if (Prefix[CrntCode] == NO_SUCH_CODE) { + CrntPrefix = LastCode; + + /* Only allowed if CrntCode is exactly + * the running code: In that case + * CrntCode = XXXCode, CrntCode or the + * prefix code is last code and the + * suffix char is exactly the prefix of + * last code! */ + if (CrntCode == + Private->RunningCode - 2) { + Suffix[Private->RunningCode - + 2] = Stack[StackPtr++] = + DGifGetPrefixChar( + Prefix, LastCode, + ClearCode); + } else { + Suffix[Private->RunningCode - + 2] = Stack[StackPtr++] = + DGifGetPrefixChar( + Prefix, CrntCode, + ClearCode); + } + } else { + CrntPrefix = CrntCode; + } + + /* Now (if image is O.K.) we should not get a + * NO_SUCH_CODE during the trace. As we might + * loop forever, in case of defective image, we + * use StackPtr as loop counter and stop before + * overflowing Stack[]. */ + while (StackPtr < LZ_MAX_CODE && + CrntPrefix > ClearCode && + CrntPrefix <= LZ_MAX_CODE) { + Stack[StackPtr++] = Suffix[CrntPrefix]; + CrntPrefix = Prefix[CrntPrefix]; + } + if (StackPtr >= LZ_MAX_CODE || + CrntPrefix > LZ_MAX_CODE) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + /* Push the last character on stack: */ + Stack[StackPtr++] = CrntPrefix; + + /* Now lets pop all the stack into output: */ + while (StackPtr != 0 && i < LineLen) { + Line[i++] = Stack[--StackPtr]; + } + } + if (LastCode != NO_SUCH_CODE && + Private->RunningCode - 2 < (LZ_MAX_CODE + 1) && + Prefix[Private->RunningCode - 2] == NO_SUCH_CODE) { + Prefix[Private->RunningCode - 2] = LastCode; + + if (CrntCode == Private->RunningCode - 2) { + /* Only allowed if CrntCode is exactly + * the running code: In that case + * CrntCode = XXXCode, CrntCode or the + * prefix code is last code and the + * suffix char is exactly the prefix of + * last code! */ + Suffix[Private->RunningCode - 2] = + DGifGetPrefixChar(Prefix, LastCode, + ClearCode); + } else { + Suffix[Private->RunningCode - 2] = + DGifGetPrefixChar(Prefix, CrntCode, + ClearCode); + } + } + LastCode = CrntCode; + } + } + + Private->LastCode = LastCode; + Private->StackPtr = StackPtr; + + return GIF_OK; +} + +/****************************************************************************** + Routine to trace the Prefixes linked list until we get a prefix which is + not code, but a pixel value (less than ClearCode). Returns that pixel value. + If image is defective, we might loop here forever, so we limit the loops to + the maximum possible if image O.k. - LZ_MAX_CODE times. 
+******************************************************************************/ +static int DGifGetPrefixChar(const GifPrefixType *Prefix, int Code, + int ClearCode) { + int i = 0; + + while (Code > ClearCode && i++ <= LZ_MAX_CODE) { + if (Code > LZ_MAX_CODE) { + return NO_SUCH_CODE; + } + Code = Prefix[Code]; + } + return Code; +} + +/****************************************************************************** + Interface for accessing the LZ codes directly. Set Code to the real code + (12bits), or to -1 if EOF code is returned. +******************************************************************************/ +int DGifGetLZCodes(GifFileType *GifFile, int *Code) { + GifByteType *CodeBlock; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifDecompressInput(GifFile, Code) == GIF_ERROR) { + return GIF_ERROR; + } + + if (*Code == Private->EOFCode) { + /* Skip rest of codes (hopefully only NULL terminating block): + */ + do { + if (DGifGetCodeNext(GifFile, &CodeBlock) == GIF_ERROR) { + return GIF_ERROR; + } + } while (CodeBlock != NULL); + + *Code = -1; + } else if (*Code == Private->ClearCode) { + /* We need to start over again: */ + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = Private->BitsPerPixel + 1; + Private->MaxCode1 = 1 << Private->RunningBits; + } + + return GIF_OK; +} + +/****************************************************************************** + The LZ decompression input routine: + This routine is responsable for the decompression of the bit stream from + 8 bits (bytes) packets, into the real codes. + Returns GIF_OK if read successfully. +******************************************************************************/ +static int DGifDecompressInput(GifFileType *GifFile, int *Code) { + static const unsigned short CodeMasks[] = { + 0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, + 0x007f, 0x00ff, 0x01ff, 0x03ff, 0x07ff, 0x0fff}; + + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + GifByteType NextByte; + + /* The image can't contain more than LZ_BITS per code. */ + if (Private->RunningBits > LZ_BITS) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + + while (Private->CrntShiftState < Private->RunningBits) { + /* Needs to get more bytes from input stream for next code: */ + if (DGifBufferedInput(GifFile, Private->Buf, &NextByte) == + GIF_ERROR) { + return GIF_ERROR; + } + Private->CrntShiftDWord |= ((unsigned long)NextByte) + << Private->CrntShiftState; + Private->CrntShiftState += 8; + } + *Code = Private->CrntShiftDWord & CodeMasks[Private->RunningBits]; + + Private->CrntShiftDWord >>= Private->RunningBits; + Private->CrntShiftState -= Private->RunningBits; + + /* If code cannot fit into RunningBits bits, must raise its size. Note + * however that codes above 4095 are used for special signaling. + * If we're using LZ_BITS bits already and we're at the max code, just + * keep using the table as it is, don't increment Private->RunningCode. 
+ */ + if (Private->RunningCode < LZ_MAX_CODE + 2 && + ++Private->RunningCode > Private->MaxCode1 && + Private->RunningBits < LZ_BITS) { + Private->MaxCode1 <<= 1; + Private->RunningBits++; + } + return GIF_OK; +} + +/****************************************************************************** + This routines read one GIF data block at a time and buffers it internally + so that the decompression routine could access it. + The routine returns the next byte from its internal buffer (or read next + block in if buffer empty) and returns GIF_OK if succesful. +******************************************************************************/ +static int DGifBufferedInput(GifFileType *GifFile, GifByteType *Buf, + GifByteType *NextByte) { + if (Buf[0] == 0) { + /* Needs to read the next buffer - this one is empty: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + /* There shouldn't be any empty data blocks here as the LZW spec + * says the LZW termination code should come first. Therefore + * we shouldn't be inside this routine at that point. + */ + if (Buf[0] == 0) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + if (InternalRead(GifFile, &Buf[1], Buf[0]) != Buf[0]) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + *NextByte = Buf[1]; + Buf[1] = 2; /* We use now the second place as last char read! */ + Buf[0]--; + } else { + *NextByte = Buf[Buf[1]++]; + Buf[0]--; + } + + return GIF_OK; +} + +/****************************************************************************** + This routine is called in case of error during parsing image. We need to + decrease image counter and reallocate memory for saved images. Not decreasing + ImageCount may lead to null pointer dereference, because the last element in + SavedImages may point to the spoilt image and null pointer buffers. +*******************************************************************************/ +void DGifDecreaseImageCounter(GifFileType *GifFile) { + GifFile->ImageCount--; + if (GifFile->SavedImages[GifFile->ImageCount].RasterBits != NULL) { + free(GifFile->SavedImages[GifFile->ImageCount].RasterBits); + } + + // Realloc array according to the new image counter. + SavedImage *correct_saved_images = (SavedImage *)reallocarray( + GifFile->SavedImages, GifFile->ImageCount, sizeof(SavedImage)); + if (correct_saved_images != NULL) { + GifFile->SavedImages = correct_saved_images; + } +} + +/****************************************************************************** + This routine reads an entire GIF into core, hanging all its state info off + the GifFileType pointer. Call DGifOpenFileName() or DGifOpenFileHandle() + first to initialize I/O. Its inverse is EGifSpew(). 
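A sketch of the high-level path this comment describes: slurp the whole file, then walk each saved frame and resolve its palette indices through the local or global color map (illustrative only; no disposal or transparency handling):

#include <stdio.h>
#include "gif_lib.h"

static void print_frame_summaries(GifFileType *gif) {
    if (DGifSlurp(gif) == GIF_ERROR) {
        return;
    }
    for (int i = 0; i < gif->ImageCount; i++) {
        SavedImage *sp = &gif->SavedImages[i];
        ColorMapObject *cmap =
            sp->ImageDesc.ColorMap ? sp->ImageDesc.ColorMap : gif->SColorMap;
        if (cmap == NULL || sp->RasterBits == NULL) {
            continue;
        }
        GifColorType c = cmap->Colors[sp->RasterBits[0]];
        printf("frame %d: %dx%d, first pixel rgb(%d, %d, %d)\n", i,
               sp->ImageDesc.Width, sp->ImageDesc.Height, c.Red, c.Green,
               c.Blue);
    }
}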
+*******************************************************************************/ +int DGifSlurp(GifFileType *GifFile) { + size_t ImageSize; + GifRecordType RecordType; + SavedImage *sp; + GifByteType *ExtData; + int ExtFunction; + + GifFile->ExtensionBlocks = NULL; + GifFile->ExtensionBlockCount = 0; + + do { + if (DGifGetRecordType(GifFile, &RecordType) == GIF_ERROR) { + return (GIF_ERROR); + } + + switch (RecordType) { + case IMAGE_DESC_RECORD_TYPE: + if (DGifGetImageDesc(GifFile) == GIF_ERROR) { + return (GIF_ERROR); + } + + sp = &GifFile->SavedImages[GifFile->ImageCount - 1]; + /* Allocate memory for the image */ + if (sp->ImageDesc.Width <= 0 || + sp->ImageDesc.Height <= 0 || + sp->ImageDesc.Width > + (INT_MAX / sp->ImageDesc.Height)) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + ImageSize = sp->ImageDesc.Width * sp->ImageDesc.Height; + + if (ImageSize > (SIZE_MAX / sizeof(GifPixelType))) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + sp->RasterBits = (unsigned char *)reallocarray( + NULL, ImageSize, sizeof(GifPixelType)); + + if (sp->RasterBits == NULL) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + + if (sp->ImageDesc.Interlace) { + int i, j; + /* + * The way an interlaced image should be read - + * offsets and jumps... + */ + static const int InterlacedOffset[] = {0, 4, 2, + 1}; + static const int InterlacedJumps[] = {8, 8, 4, + 2}; + /* Need to perform 4 passes on the image */ + for (i = 0; i < 4; i++) { + for (j = InterlacedOffset[i]; + j < sp->ImageDesc.Height; + j += InterlacedJumps[i]) { + if (DGifGetLine( + GifFile, + sp->RasterBits + + j * sp->ImageDesc + .Width, + sp->ImageDesc.Width) == + GIF_ERROR) { + DGifDecreaseImageCounter( + GifFile); + return GIF_ERROR; + } + } + } + } else { + if (DGifGetLine(GifFile, sp->RasterBits, + ImageSize) == GIF_ERROR) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + } + + if (GifFile->ExtensionBlocks) { + sp->ExtensionBlocks = GifFile->ExtensionBlocks; + sp->ExtensionBlockCount = + GifFile->ExtensionBlockCount; + + GifFile->ExtensionBlocks = NULL; + GifFile->ExtensionBlockCount = 0; + } + break; + + case EXTENSION_RECORD_TYPE: + if (DGifGetExtension(GifFile, &ExtFunction, &ExtData) == + GIF_ERROR) { + return (GIF_ERROR); + } + /* Create an extension block with our data */ + if (ExtData != NULL) { + if (GifAddExtensionBlock( + &GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks, ExtFunction, + ExtData[0], &ExtData[1]) == GIF_ERROR) { + return (GIF_ERROR); + } + } + for (;;) { + if (DGifGetExtensionNext(GifFile, &ExtData) == + GIF_ERROR) { + return (GIF_ERROR); + } + if (ExtData == NULL) { + break; + } + /* Continue the extension block */ + if (GifAddExtensionBlock( + &GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks, + CONTINUE_EXT_FUNC_CODE, ExtData[0], + &ExtData[1]) == GIF_ERROR) { + return (GIF_ERROR); + } + } + break; + + case TERMINATE_RECORD_TYPE: + break; + + default: /* Should be trapped by DGifGetRecordType */ + break; + } + } while (RecordType != TERMINATE_RECORD_TYPE); + + /* Sanity check for corrupted file */ + if (GifFile->ImageCount == 0) { + GifFile->Error = D_GIF_ERR_NO_IMAG_DSCR; + return (GIF_ERROR); + } + + return (GIF_OK); +} + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_hash.c b/torchvision/csrc/io/image/cpu/giflib/gif_hash.c new file mode 100644 index 00000000000..42efbe8de68 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_hash.c @@ -0,0 +1,129 @@ +// @nolint (improperly imported third-party code) 
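The interlace handling in DGifSlurp above fills rows in four passes, offsets 0, 4, 2, 1 with strides 8, 8, 4, 2. A standalone illustration of the resulting storage order for a 10-row frame (it prints 0 8 4 2 6 1 3 5 7 9):

#include <stdio.h>

static void show_interlace_order(void) {
    static const int offset[] = {0, 4, 2, 1};
    static const int jump[] = {8, 8, 4, 2};
    const int height = 10;
    for (int pass = 0; pass < 4; pass++) {
        for (int row = offset[pass]; row < height; row += jump[pass]) {
            printf("%d ", row);
        }
    }
    printf("\n");
}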
+/***************************************************************************** + +gif_hash.c -- module to support the following operations: + +1. InitHashTable - initialize hash table. +2. ClearHashTable - clear the hash table to an empty state. +2. InsertHashTable - insert one item into data structure. +3. ExistsHashTable - test if item exists in data structure. + +This module is used to hash the GIF codes during encoding. + +*****************************************************************************/ +// SPDX-License-Identifier: MIT +// SPDX-File-Copyright-Txt: (C) Copyright 1989 Gershon Elber + +#include +#include +#include +#include +#include + +#include "gif_hash.h" +#include "gif_lib.h" +#include "gif_lib_private.h" + +/* #define DEBUG_HIT_RATE Debug number of misses per hash Insert/Exists. */ + +#ifdef DEBUG_HIT_RATE +static long NumberOfTests = 0, NumberOfMisses = 0; +#endif /* DEBUG_HIT_RATE */ + +static int KeyItem(uint32_t Item); + +/****************************************************************************** + Initialize HashTable - allocate the memory needed and clear it. * +******************************************************************************/ +GifHashTableType *_InitHashTable(void) { + GifHashTableType *HashTable; + + if ((HashTable = (GifHashTableType *)malloc( + sizeof(GifHashTableType))) == NULL) { + return NULL; + } + + _ClearHashTable(HashTable); + + return HashTable; +} + +/****************************************************************************** + Routine to clear the HashTable to an empty state. * + This part is a little machine depended. Use the commented part otherwise. * +******************************************************************************/ +void _ClearHashTable(GifHashTableType *HashTable) { + memset(HashTable->HTable, 0xFF, HT_SIZE * sizeof(uint32_t)); +} + +/****************************************************************************** + Routine to insert a new Item into the HashTable. The data is assumed to be * + new one. * +******************************************************************************/ +void _InsertHashTable(GifHashTableType *HashTable, uint32_t Key, int Code) { + int HKey = KeyItem(Key); + uint32_t *HTable = HashTable->HTable; + +#ifdef DEBUG_HIT_RATE + NumberOfTests++; + NumberOfMisses++; +#endif /* DEBUG_HIT_RATE */ + + while (HT_GET_KEY(HTable[HKey]) != 0xFFFFFL) { +#ifdef DEBUG_HIT_RATE + NumberOfMisses++; +#endif /* DEBUG_HIT_RATE */ + HKey = (HKey + 1) & HT_KEY_MASK; + } + HTable[HKey] = HT_PUT_KEY(Key) | HT_PUT_CODE(Code); +} + +/****************************************************************************** + Routine to test if given Key exists in HashTable and if so returns its code * + Returns the Code if key was found, -1 if not. * +******************************************************************************/ +int _ExistsHashTable(GifHashTableType *HashTable, uint32_t Key) { + int HKey = KeyItem(Key); + uint32_t *HTable = HashTable->HTable, HTKey; + +#ifdef DEBUG_HIT_RATE + NumberOfTests++; + NumberOfMisses++; +#endif /* DEBUG_HIT_RATE */ + + while ((HTKey = HT_GET_KEY(HTable[HKey])) != 0xFFFFFL) { +#ifdef DEBUG_HIT_RATE + NumberOfMisses++; +#endif /* DEBUG_HIT_RATE */ + if (Key == HTKey) { + return HT_GET_CODE(HTable[HKey]); + } + HKey = (HKey + 1) & HT_KEY_MASK; + } + + return -1; +} + +/****************************************************************************** + Routine to generate an HKey for the hashtable out of the given unique key. 
* + The given Key is assumed to be 20 bits as follows: lower 8 bits are the * + new postfix character, while the upper 12 bits are the prefix code. * + Because the average hit ratio is only 2 (2 hash references per entry), * + evaluating more complex keys (such as twin prime keys) is not worth it! * +******************************************************************************/ +static int KeyItem(uint32_t Item) { + return ((Item >> 12) ^ Item) & HT_KEY_MASK; +} + +#ifdef DEBUG_HIT_RATE +/****************************************************************************** + Debugging routine to print the hit ratio - number of times the hash table * + was tested per operation. This routine was used to test the KeyItem routine * +******************************************************************************/ +void HashTablePrintHitRatio(void) { + printf("Hash Table Hit Ratio is %ld/%ld = %ld%%.\n", NumberOfMisses, + NumberOfTests, NumberOfMisses * 100 / NumberOfTests); +} +#endif /* DEBUG_HIT_RATE */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_hash.h b/torchvision/csrc/io/image/cpu/giflib/gif_hash.h new file mode 100644 index 00000000000..3066fb14592 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_hash.h @@ -0,0 +1,43 @@ +// @nolint (improperly imported third-party code) +/****************************************************************************** + +gif_hash.h - magic constants and declarations for GIF LZW + +******************************************************************************/ +// SPDX-License-Identifier: MIT + +#ifndef _GIF_HASH_H_ +#define _GIF_HASH_H_ + +#ifndef _WIN32 +#include <unistd.h> +#endif /* _WIN32 */ +#include <stdint.h> + +#define HT_SIZE 8192 /* 12bits = 4096 or twice as big! */ +#define HT_KEY_MASK 0x1FFF /* 13bits keys */ +#define HT_KEY_NUM_BITS 13 /* 13bits keys */ +#define HT_MAX_KEY 8191 /* 13bits - 1, maximal code possible */ +#define HT_MAX_CODE 4095 /* Biggest code possible in 12 bits. */ + +/* The 32 bits of the long are divided into two parts for the key & code: */ +/* 1. The code is 12 bits as our compression algorithm is limited to 12bits */ +/* 2. The key is 12 bits Prefix code + 8 bit new char or 20 bits. */ +/* The key is the upper 20 bits. The code is the lower 12. 
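The encoder in egif_lib.c (not part of this hunk) builds that 20-bit key by packing the 12-bit prefix code above the 8-bit character that follows it; the packing below is therefore an assumption here, while the xor-fold mirrors KeyItem above:

#include <stdint.h>

/* Illustrative only: pack (prefix code, next byte) and fold to a 13-bit slot. */
static int example_slot(uint32_t prefix_code, uint8_t next_byte) {
    uint32_t key = (prefix_code << 8) | next_byte; /* 12 + 8 = 20 bits (assumed) */
    return (int)(((key >> 12) ^ key) & 0x1FFF);    /* 0x1FFF == HT_KEY_MASK */
}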
*/ +#define HT_GET_KEY(l) (l >> 12) +#define HT_GET_CODE(l) (l & 0x0FFF) +#define HT_PUT_KEY(l) (l << 12) +#define HT_PUT_CODE(l) (l & 0x0FFF) + +typedef struct GifHashTableType { + uint32_t HTable[HT_SIZE]; +} GifHashTableType; + +GifHashTableType *_InitHashTable(void); +void _ClearHashTable(GifHashTableType *HashTable); +void _InsertHashTable(GifHashTableType *HashTable, uint32_t Key, int Code); +int _ExistsHashTable(GifHashTableType *HashTable, uint32_t Key); + +#endif /* _GIF_HASH_H_ */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_lib.h b/torchvision/csrc/io/image/cpu/giflib/gif_lib.h new file mode 100644 index 00000000000..7bed0430450 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_lib.h @@ -0,0 +1,292 @@ +// @nolint (improperly imported third-party code) +/****************************************************************************** + +gif_lib.h - service library for decoding and encoding GIF images + +SPDX-License-Identifier: MIT + +*****************************************************************************/ + +#ifndef _GIF_LIB_H_ +#define _GIF_LIB_H_ 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#define GIFLIB_MAJOR 5 +#define GIFLIB_MINOR 2 +#define GIFLIB_RELEASE 2 + +#define GIF_ERROR 0 +#define GIF_OK 1 + +#include +#include + +#define GIF_STAMP "GIFVER" /* First chars in file - GIF stamp. */ +#define GIF_STAMP_LEN sizeof(GIF_STAMP) - 1 +#define GIF_VERSION_POS 3 /* Version first character in stamp. */ +#define GIF87_STAMP "GIF87a" /* First chars in file - GIF stamp. */ +#define GIF89_STAMP "GIF89a" /* First chars in file - GIF stamp. */ + +typedef unsigned char GifPixelType; +typedef unsigned char *GifRowType; +typedef unsigned char GifByteType; +typedef unsigned int GifPrefixType; +typedef int GifWord; + +typedef struct GifColorType { + GifByteType Red, Green, Blue; +} GifColorType; + +typedef struct ColorMapObject { + int ColorCount; + int BitsPerPixel; + bool SortFlag; + GifColorType *Colors; /* on malloc(3) heap */ +} ColorMapObject; + +typedef struct GifImageDesc { + GifWord Left, Top, Width, Height; /* Current image dimensions. */ + bool Interlace; /* Sequential/Interlaced lines. */ + ColorMapObject *ColorMap; /* The local color map */ +} GifImageDesc; + +typedef struct ExtensionBlock { + int ByteCount; + GifByteType *Bytes; /* on malloc(3) heap */ + int Function; /* The block function code */ +#define CONTINUE_EXT_FUNC_CODE 0x00 /* continuation subblock */ +#define COMMENT_EXT_FUNC_CODE 0xfe /* comment */ +#define GRAPHICS_EXT_FUNC_CODE 0xf9 /* graphics control (GIF89) */ +#define PLAINTEXT_EXT_FUNC_CODE 0x01 /* plaintext */ +#define APPLICATION_EXT_FUNC_CODE 0xff /* application block (GIF89) */ +} ExtensionBlock; + +typedef struct SavedImage { + GifImageDesc ImageDesc; + GifByteType *RasterBits; /* on malloc(3) heap */ + int ExtensionBlockCount; /* Count of extensions before image */ + ExtensionBlock *ExtensionBlocks; /* Extensions before image */ +} SavedImage; + +typedef struct GifFileType { + GifWord SWidth, SHeight; /* Size of virtual canvas */ + GifWord SColorResolution; /* How many colors can we generate? */ + GifWord SBackGroundColor; /* Background color for virtual canvas */ + GifByteType AspectByte; /* Used to compute pixel aspect ratio */ + ColorMapObject *SColorMap; /* Global colormap, NULL if nonexistent. 
*/ + int ImageCount; /* Number of current image (both APIs) */ + GifImageDesc Image; /* Current image (low-level API) */ + SavedImage *SavedImages; /* Image sequence (high-level API) */ + int ExtensionBlockCount; /* Count extensions past last image */ + ExtensionBlock *ExtensionBlocks; /* Extensions past last image */ + int Error; /* Last error condition reported */ + void *UserData; /* hook to attach user data (TVT) */ + void *Private; /* Don't mess with this! */ +} GifFileType; + +#define GIF_ASPECT_RATIO(n) ((n) + 15.0 / 64.0) + +typedef enum { + UNDEFINED_RECORD_TYPE, + SCREEN_DESC_RECORD_TYPE, + IMAGE_DESC_RECORD_TYPE, /* Begin with ',' */ + EXTENSION_RECORD_TYPE, /* Begin with '!' */ + TERMINATE_RECORD_TYPE /* Begin with ';' */ +} GifRecordType; + +/* func type to read gif data from arbitrary sources (TVT) */ +typedef int (*InputFunc)(GifFileType *, GifByteType *, int); + +/* func type to write gif data to arbitrary targets. + * Returns count of bytes written. (MRB) + */ +typedef int (*OutputFunc)(GifFileType *, const GifByteType *, int); + +/****************************************************************************** + GIF89 structures +******************************************************************************/ + +typedef struct GraphicsControlBlock { + int DisposalMode; +#define DISPOSAL_UNSPECIFIED 0 /* No disposal specified. */ +#define DISPOSE_DO_NOT 1 /* Leave image in place */ +#define DISPOSE_BACKGROUND 2 /* Set area too background color */ +#define DISPOSE_PREVIOUS 3 /* Restore to previous content */ + bool UserInputFlag; /* User confirmation required before disposal */ + int DelayTime; /* pre-display delay in 0.01sec units */ + int TransparentColor; /* Palette index for transparency, -1 if none */ +#define NO_TRANSPARENT_COLOR -1 +} GraphicsControlBlock; + +/****************************************************************************** + GIF encoding routines +******************************************************************************/ + +/* Main entry points */ +GifFileType *EGifOpenFileName(const char *GifFileName, + const bool GifTestExistence, int *Error); +GifFileType *EGifOpenFileHandle(const int GifFileHandle, int *Error); +GifFileType *EGifOpen(void *userPtr, OutputFunc writeFunc, int *Error); +int EGifSpew(GifFileType *GifFile); +const char *EGifGetGifVersion(GifFileType *GifFile); /* new in 5.x */ +int EGifCloseFile(GifFileType *GifFile, int *ErrorCode); + +#define E_GIF_SUCCEEDED 0 +#define E_GIF_ERR_OPEN_FAILED 1 /* And EGif possible errors. */ +#define E_GIF_ERR_WRITE_FAILED 2 +#define E_GIF_ERR_HAS_SCRN_DSCR 3 +#define E_GIF_ERR_HAS_IMAG_DSCR 4 +#define E_GIF_ERR_NO_COLOR_MAP 5 +#define E_GIF_ERR_DATA_TOO_BIG 6 +#define E_GIF_ERR_NOT_ENOUGH_MEM 7 +#define E_GIF_ERR_DISK_IS_FULL 8 +#define E_GIF_ERR_CLOSE_FAILED 9 +#define E_GIF_ERR_NOT_WRITEABLE 10 + +/* These are legacy. 
You probably do not want to call them directly */ +int EGifPutScreenDesc(GifFileType *GifFile, const int GifWidth, + const int GifHeight, const int GifColorRes, + const int GifBackGround, + const ColorMapObject *GifColorMap); +int EGifPutImageDesc(GifFileType *GifFile, const int GifLeft, const int GifTop, + const int GifWidth, const int GifHeight, + const bool GifInterlace, + const ColorMapObject *GifColorMap); +void EGifSetGifVersion(GifFileType *GifFile, const bool gif89); +int EGifPutLine(GifFileType *GifFile, GifPixelType *GifLine, int GifLineLen); +int EGifPutPixel(GifFileType *GifFile, const GifPixelType GifPixel); +int EGifPutComment(GifFileType *GifFile, const char *GifComment); +int EGifPutExtensionLeader(GifFileType *GifFile, const int GifExtCode); +int EGifPutExtensionBlock(GifFileType *GifFile, const int GifExtLen, + const void *GifExtension); +int EGifPutExtensionTrailer(GifFileType *GifFile); +int EGifPutExtension(GifFileType *GifFile, const int GifExtCode, + const int GifExtLen, const void *GifExtension); +int EGifPutCode(GifFileType *GifFile, int GifCodeSize, + const GifByteType *GifCodeBlock); +int EGifPutCodeNext(GifFileType *GifFile, const GifByteType *GifCodeBlock); + +/****************************************************************************** + GIF decoding routines +******************************************************************************/ + +/* Main entry points */ +GifFileType *DGifOpenFileName(const char *GifFileName, int *Error); +GifFileType *DGifOpenFileHandle(int GifFileHandle, int *Error); +int DGifSlurp(GifFileType *GifFile); +GifFileType *DGifOpen(void *userPtr, InputFunc readFunc, + int *Error); /* new one (TVT) */ +int DGifCloseFile(GifFileType *GifFile, int *ErrorCode); + +#define D_GIF_SUCCEEDED 0 +#define D_GIF_ERR_OPEN_FAILED 101 /* And DGif possible errors. */ +#define D_GIF_ERR_READ_FAILED 102 +#define D_GIF_ERR_NOT_GIF_FILE 103 +#define D_GIF_ERR_NO_SCRN_DSCR 104 +#define D_GIF_ERR_NO_IMAG_DSCR 105 +#define D_GIF_ERR_NO_COLOR_MAP 106 +#define D_GIF_ERR_WRONG_RECORD 107 +#define D_GIF_ERR_DATA_TOO_BIG 108 +#define D_GIF_ERR_NOT_ENOUGH_MEM 109 +#define D_GIF_ERR_CLOSE_FAILED 110 +#define D_GIF_ERR_NOT_READABLE 111 +#define D_GIF_ERR_IMAGE_DEFECT 112 +#define D_GIF_ERR_EOF_TOO_SOON 113 + +/* These are legacy. You probably do not want to call them directly */ +int DGifGetScreenDesc(GifFileType *GifFile); +int DGifGetRecordType(GifFileType *GifFile, GifRecordType *GifType); +int DGifGetImageHeader(GifFileType *GifFile); +int DGifGetImageDesc(GifFileType *GifFile); +int DGifGetLine(GifFileType *GifFile, GifPixelType *GifLine, int GifLineLen); +int DGifGetPixel(GifFileType *GifFile, GifPixelType GifPixel); +int DGifGetExtension(GifFileType *GifFile, int *GifExtCode, + GifByteType **GifExtension); +int DGifGetExtensionNext(GifFileType *GifFile, GifByteType **GifExtension); +int DGifGetCode(GifFileType *GifFile, int *GifCodeSize, + GifByteType **GifCodeBlock); +int DGifGetCodeNext(GifFileType *GifFile, GifByteType **GifCodeBlock); +int DGifGetLZCodes(GifFileType *GifFile, int *GifCode); +const char *DGifGetGifVersion(GifFileType *GifFile); + +/****************************************************************************** + Error handling and reporting. +******************************************************************************/ +extern const char *GifErrorString(int ErrorCode); /* new in 2012 - ESR */ + +/***************************************************************************** + it g in core. 
+******************************************************************************/ + +/****************************************************************************** + Color map handling from gif_alloc.c +******************************************************************************/ + +extern ColorMapObject *GifMakeMapObject(int ColorCount, + const GifColorType *ColorMap); +extern void GifFreeMapObject(ColorMapObject *Object); +extern ColorMapObject *GifUnionColorMap(const ColorMapObject *ColorIn1, + const ColorMapObject *ColorIn2, + GifPixelType ColorTransIn2[]); +extern int GifBitSize(int n); + +/****************************************************************************** + Support for the in-core structures allocation (slurp mode). +******************************************************************************/ + +extern void GifApplyTranslation(SavedImage *Image, + const GifPixelType Translation[]); +extern int GifAddExtensionBlock(int *ExtensionBlock_Count, + ExtensionBlock **ExtensionBlocks, int Function, + unsigned int Len, unsigned char ExtData[]); +extern void GifFreeExtensions(int *ExtensionBlock_Count, + ExtensionBlock **ExtensionBlocks); +extern SavedImage *GifMakeSavedImage(GifFileType *GifFile, + const SavedImage *CopyFrom); +extern void GifFreeSavedImages(GifFileType *GifFile); + +/****************************************************************************** + 5.x functions for GIF89 graphics control blocks +******************************************************************************/ + +int DGifExtensionToGCB(const size_t GifExtensionLength, + const GifByteType *GifExtension, + GraphicsControlBlock *GCB); +size_t EGifGCBToExtension(const GraphicsControlBlock *GCB, + GifByteType *GifExtension); + +int DGifSavedExtensionToGCB(GifFileType *GifFile, int ImageIndex, + GraphicsControlBlock *GCB); +int EGifGCBToSavedExtension(const GraphicsControlBlock *GCB, + GifFileType *GifFile, int ImageIndex); + +/****************************************************************************** + The library's internal utility font +******************************************************************************/ + +#define GIF_FONT_WIDTH 8 +#define GIF_FONT_HEIGHT 8 +extern const unsigned char GifAsciiTable8x8[][GIF_FONT_WIDTH]; + +extern void GifDrawText8x8(SavedImage *Image, const int x, const int y, + const char *legend, const int color); + +extern void GifDrawBox(SavedImage *Image, const int x, const int y, const int w, + const int d, const int color); + +extern void GifDrawRectangle(SavedImage *Image, const int x, const int y, + const int w, const int d, const int color); + +extern void GifDrawBoxedText8x8(SavedImage *Image, const int x, const int y, + const char *legend, const int border, + const int bg, const int fg); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* _GIF_LIB_H */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h b/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h new file mode 100644 index 00000000000..04987150321 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h @@ -0,0 +1,73 @@ +// @nolint (improperly imported third-party code) +/**************************************************************************** + +gif_lib_private.h - internal giflib routines and structures + +SPDX-License-Identifier: MIT + +****************************************************************************/ + +#ifndef _GIF_LIB_PRIVATE_H +#define _GIF_LIB_PRIVATE_H + +#include "gif_hash.h" +#include "gif_lib.h" + +#ifndef 
SIZE_MAX +#define SIZE_MAX UINTPTR_MAX +#endif + +#define EXTENSION_INTRODUCER 0x21 +#define DESCRIPTOR_INTRODUCER 0x2c +#define TERMINATOR_INTRODUCER 0x3b + +#define LZ_MAX_CODE 4095 /* Biggest code possible in 12 bits. */ +#define LZ_BITS 12 + +#define FLUSH_OUTPUT 4096 /* Impossible code, to signal flush. */ +#define FIRST_CODE 4097 /* Impossible code, to signal first. */ +#define NO_SUCH_CODE 4098 /* Impossible code, to signal empty. */ + +#define FILE_STATE_WRITE 0x01 +#define FILE_STATE_SCREEN 0x02 +#define FILE_STATE_IMAGE 0x04 +#define FILE_STATE_READ 0x08 + +#define IS_READABLE(Private) (Private->FileState & FILE_STATE_READ) +#define IS_WRITEABLE(Private) (Private->FileState & FILE_STATE_WRITE) + +typedef struct GifFilePrivateType { + GifWord FileState, FileHandle, /* Where all this data goes to! */ + BitsPerPixel, /* Bits per pixel (Codes uses at least this + 1). */ + ClearCode, /* The CLEAR LZ code. */ + EOFCode, /* The EOF LZ code. */ + RunningCode, /* The next code algorithm can generate. */ + RunningBits, /* The number of bits required to represent + RunningCode. */ + MaxCode1, /* 1 bigger than max. possible code, in RunningBits bits. + */ + LastCode, /* The code before the current code. */ + CrntCode, /* Current algorithm code. */ + StackPtr, /* For character stack (see below). */ + CrntShiftState; /* Number of bits in CrntShiftDWord. */ + unsigned long CrntShiftDWord; /* For bytes decomposition into codes. */ + unsigned long PixelCount; /* Number of pixels in image. */ + FILE *File; /* File as stream. */ + InputFunc Read; /* function to read gif input (TVT) */ + OutputFunc Write; /* function to write gif output (MRB) */ + GifByteType Buf[256]; /* Compressed input is buffered here. */ + GifByteType Stack[LZ_MAX_CODE]; /* Decoded pixels are stacked here. */ + GifByteType Suffix[LZ_MAX_CODE + 1]; /* So we can trace the codes. */ + GifPrefixType Prefix[LZ_MAX_CODE + 1]; + GifHashTableType *HashTable; + bool gif89; +} GifFilePrivateType; + +#ifndef HAVE_REALLOCARRAY +extern void *openbsd_reallocarray(void *optr, size_t nmemb, size_t size); +#define reallocarray openbsd_reallocarray +#endif + +#endif /* _GIF_LIB_PRIVATE_H */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gifalloc.c b/torchvision/csrc/io/image/cpu/giflib/gifalloc.c new file mode 100644 index 00000000000..65679d22804 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gifalloc.c @@ -0,0 +1,426 @@ +// @nolint (improperly imported third-party code) +/***************************************************************************** + + GIF construction tools + +****************************************************************************/ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright (C) Eric S. Raymond + +#include +#include +#include + +#include "gif_lib.h" +#include "gif_lib_private.h" + +#define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) + +/****************************************************************************** + Miscellaneous utility functions +******************************************************************************/ + +/* return smallest bitfield size n will fit in */ +int GifBitSize(int n) { + int i; + + for (i = 1; i <= 8; i++) { + if ((1 << i) >= n) { + break; + } + } + return (i); +} + +/****************************************************************************** + Color map object functions +******************************************************************************/ + +/* + * Allocate a color map of given size; initialize with contents of + * ColorMap if that pointer is non-NULL. + */ +ColorMapObject *GifMakeMapObject(int ColorCount, const GifColorType *ColorMap) { + ColorMapObject *Object; + + /*** FIXME: Our ColorCount has to be a power of two. Is it necessary to + * make the user know that or should we automatically round up instead? + */ + if (ColorCount != (1 << GifBitSize(ColorCount))) { + return ((ColorMapObject *)NULL); + } + + Object = (ColorMapObject *)malloc(sizeof(ColorMapObject)); + if (Object == (ColorMapObject *)NULL) { + return ((ColorMapObject *)NULL); + } + + Object->Colors = + (GifColorType *)calloc(ColorCount, sizeof(GifColorType)); + if (Object->Colors == (GifColorType *)NULL) { + free(Object); + return ((ColorMapObject *)NULL); + } + + Object->ColorCount = ColorCount; + Object->BitsPerPixel = GifBitSize(ColorCount); + Object->SortFlag = false; + + if (ColorMap != NULL) { + memcpy((char *)Object->Colors, (char *)ColorMap, + ColorCount * sizeof(GifColorType)); + } + + return (Object); +} + +/******************************************************************************* + Free a color map object +*******************************************************************************/ +void GifFreeMapObject(ColorMapObject *Object) { + if (Object != NULL) { + (void)free(Object->Colors); + (void)free(Object); + } +} + +#ifdef DEBUG +void DumpColorMap(ColorMapObject *Object, FILE *fp) { + if (Object != NULL) { + int i, j, Len = Object->ColorCount; + + for (i = 0; i < Len; i += 4) { + for (j = 0; j < 4 && j < Len; j++) { + (void)fprintf(fp, "%3d: %02x %02x %02x ", + i + j, Object->Colors[i + j].Red, + Object->Colors[i + j].Green, + Object->Colors[i + j].Blue); + } + (void)fprintf(fp, "\n"); + } + } +} +#endif /* DEBUG */ + +/******************************************************************************* + Compute the union of two given color maps and return it. If result can't + fit into 256 colors, NULL is returned, the allocated union otherwise. + ColorIn1 is copied as is to ColorUnion, while colors from ColorIn2 are + copied iff they didn't exist before. ColorTransIn2 maps the old + ColorIn2 into the ColorUnion color map table./ +*******************************************************************************/ +ColorMapObject *GifUnionColorMap(const ColorMapObject *ColorIn1, + const ColorMapObject *ColorIn2, + GifPixelType ColorTransIn2[]) { + int i, j, CrntSlot, RoundUpTo, NewGifBitSize; + ColorMapObject *ColorUnion; + + /* + * We don't worry about duplicates within either color map; if + * the caller wants to resolve those, he can perform unions + * with an empty color map. + */ + + /* Allocate table which will hold the result for sure. */ + ColorUnion = GifMakeMapObject( + MAX(ColorIn1->ColorCount, ColorIn2->ColorCount) * 2, NULL); + + if (ColorUnion == NULL) { + return (NULL); + } + + /* + * Copy ColorIn1 to ColorUnion. 
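GifMakeMapObject() above rejects any color count that is not an exact power of two (the FIXME notes this), so callers are expected to round up themselves. A minimal caller-side sketch, assuming nothing beyond the functions declared in gif_lib.h; the helper name make_rounded_color_map is illustrative and not part of giflib:

#include <string.h>
#include "gif_lib.h"

/* Build a color map from an arbitrary number of colors by rounding the slot
 * count up to the next power of two, e.g. 5 -> 8, 200 -> 256. */
static ColorMapObject *make_rounded_color_map(int color_count,
                                              const GifColorType *colors) {
    int rounded = 1 << GifBitSize(color_count);

    /* Allocate zero-initialized slots, then copy only the real entries so we
     * never read past the caller's array. */
    ColorMapObject *map = GifMakeMapObject(rounded, NULL);
    if (map != NULL && colors != NULL) {
        memcpy(map->Colors, colors, color_count * sizeof(GifColorType));
    }
    return map; /* NULL on allocation failure */
}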
+ */ + for (i = 0; i < ColorIn1->ColorCount; i++) { + ColorUnion->Colors[i] = ColorIn1->Colors[i]; + } + CrntSlot = ColorIn1->ColorCount; + + /* + * Potentially obnoxious hack: + * + * Back CrntSlot down past all contiguous {0, 0, 0} slots at the end + * of table 1. This is very useful if your display is limited to + * 16 colors. + */ + while (ColorIn1->Colors[CrntSlot - 1].Red == 0 && + ColorIn1->Colors[CrntSlot - 1].Green == 0 && + ColorIn1->Colors[CrntSlot - 1].Blue == 0) { + CrntSlot--; + } + + /* Copy ColorIn2 to ColorUnion (use old colors if they exist): */ + for (i = 0; i < ColorIn2->ColorCount && CrntSlot <= 256; i++) { + /* Let's see if this color already exists: */ + for (j = 0; j < ColorIn1->ColorCount; j++) { + if (memcmp(&ColorIn1->Colors[j], &ColorIn2->Colors[i], + sizeof(GifColorType)) == 0) { + break; + } + } + + if (j < ColorIn1->ColorCount) { + ColorTransIn2[i] = j; /* color exists in Color1 */ + } else { + /* Color is new - copy it to a new slot: */ + ColorUnion->Colors[CrntSlot] = ColorIn2->Colors[i]; + ColorTransIn2[i] = CrntSlot++; + } + } + + if (CrntSlot > 256) { + GifFreeMapObject(ColorUnion); + return ((ColorMapObject *)NULL); + } + + NewGifBitSize = GifBitSize(CrntSlot); + RoundUpTo = (1 << NewGifBitSize); + + if (RoundUpTo != ColorUnion->ColorCount) { + GifColorType *Map = ColorUnion->Colors; + + /* + * Zero out slots up to next power of 2. + * We know these slots exist because of the way ColorUnion's + * start dimension was computed. + */ + for (j = CrntSlot; j < RoundUpTo; j++) { + Map[j].Red = Map[j].Green = Map[j].Blue = 0; + } + + /* perhaps we can shrink the map? */ + if (RoundUpTo < ColorUnion->ColorCount) { + GifColorType *new_map = (GifColorType *)reallocarray( + Map, RoundUpTo, sizeof(GifColorType)); + if (new_map == NULL) { + GifFreeMapObject(ColorUnion); + return ((ColorMapObject *)NULL); + } + ColorUnion->Colors = new_map; + } + } + + ColorUnion->ColorCount = RoundUpTo; + ColorUnion->BitsPerPixel = NewGifBitSize; + + return (ColorUnion); +} + +/******************************************************************************* + Apply a given color translation to the raster bits of an image +*******************************************************************************/ +void GifApplyTranslation(SavedImage *Image, const GifPixelType Translation[]) { + int i; + int RasterSize = + Image->ImageDesc.Height * Image->ImageDesc.Width; + + for (i = 0; i < RasterSize; i++) { + Image->RasterBits[i] = Translation[Image->RasterBits[i]]; + } +} + +/****************************************************************************** + Extension record functions +******************************************************************************/ +int GifAddExtensionBlock(int *ExtensionBlockCount, + ExtensionBlock **ExtensionBlocks, int Function, + unsigned int Len, unsigned char ExtData[]) { + ExtensionBlock *ep; + + if (*ExtensionBlocks == NULL) { + *ExtensionBlocks = + (ExtensionBlock *)malloc(sizeof(ExtensionBlock)); + } else { + ExtensionBlock *ep_new = (ExtensionBlock *)reallocarray( + *ExtensionBlocks, (*ExtensionBlockCount + 1), + sizeof(ExtensionBlock)); + if (ep_new == NULL) { + return (GIF_ERROR); + } + *ExtensionBlocks = ep_new; + } + + if (*ExtensionBlocks == NULL) { + return (GIF_ERROR); + } + + ep = &(*ExtensionBlocks)[(*ExtensionBlockCount)++]; + + ep->Function = Function; + ep->ByteCount = Len; + ep->Bytes = (GifByteType *)malloc(ep->ByteCount); + if (ep->Bytes == NULL) { + return (GIF_ERROR); + } + + if (ExtData != NULL) { + memcpy(ep->Bytes, ExtData, Len); + 
} + + return (GIF_OK); +} + +void GifFreeExtensions(int *ExtensionBlockCount, + ExtensionBlock **ExtensionBlocks) { + ExtensionBlock *ep; + + if (*ExtensionBlocks == NULL) { + return; + } + + for (ep = *ExtensionBlocks; + ep < (*ExtensionBlocks + *ExtensionBlockCount); ep++) { + (void)free((char *)ep->Bytes); + } + (void)free((char *)*ExtensionBlocks); + *ExtensionBlocks = NULL; + *ExtensionBlockCount = 0; +} + +/****************************************************************************** + Image block allocation functions +******************************************************************************/ + +/* Private Function: + * Frees the last image in the GifFile->SavedImages array + */ +void FreeLastSavedImage(GifFileType *GifFile) { + SavedImage *sp; + + if ((GifFile == NULL) || (GifFile->SavedImages == NULL)) { + return; + } + + /* Remove one SavedImage from the GifFile */ + GifFile->ImageCount--; + sp = &GifFile->SavedImages[GifFile->ImageCount]; + + /* Deallocate its Colormap */ + if (sp->ImageDesc.ColorMap != NULL) { + GifFreeMapObject(sp->ImageDesc.ColorMap); + sp->ImageDesc.ColorMap = NULL; + } + + /* Deallocate the image data */ + if (sp->RasterBits != NULL) { + free((char *)sp->RasterBits); + } + + /* Deallocate any extensions */ + GifFreeExtensions(&sp->ExtensionBlockCount, &sp->ExtensionBlocks); + + /*** FIXME: We could realloc the GifFile->SavedImages structure but is + * there a point to it? Saves some memory but we'd have to do it every + * time. If this is used in GifFreeSavedImages then it would be + * inefficient (The whole array is going to be deallocated.) If we just + * use it when we want to free the last Image it's convenient to do it + * here. + */ +} + +/* + * Append an image block to the SavedImages array + */ +SavedImage *GifMakeSavedImage(GifFileType *GifFile, + const SavedImage *CopyFrom) { + // cppcheck-suppress ctunullpointer + if (GifFile->SavedImages == NULL) { + GifFile->SavedImages = (SavedImage *)malloc(sizeof(SavedImage)); + } else { + SavedImage *newSavedImages = (SavedImage *)reallocarray( + GifFile->SavedImages, (GifFile->ImageCount + 1), + sizeof(SavedImage)); + if (newSavedImages == NULL) { + return ((SavedImage *)NULL); + } + GifFile->SavedImages = newSavedImages; + } + if (GifFile->SavedImages == NULL) { + return ((SavedImage *)NULL); + } else { + SavedImage *sp = &GifFile->SavedImages[GifFile->ImageCount++]; + + if (CopyFrom != NULL) { + memcpy((char *)sp, CopyFrom, sizeof(SavedImage)); + + /* + * Make our own allocated copies of the heap fields in + * the copied record. This guards against potential + * aliasing problems. 
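SavedImage owns three heap fields (ImageDesc.ColorMap, RasterBits and ExtensionBlocks), so a plain struct copy would leave two records aliasing the same buffers and GifFreeSavedImages() would later free them twice; the deep copies made below avoid that. A short usage sketch, with the helper name append_frame_copy being illustrative only:

#include "gif_lib.h"

/* Append a deep copy of an existing frame to a slurped GIF.  On failure
 * GifMakeSavedImage() has already rolled the partial record back via
 * FreeLastSavedImage(), so there is nothing further to clean up here. */
static int append_frame_copy(GifFileType *gif, const SavedImage *frame) {
    return (GifMakeSavedImage(gif, frame) != NULL) ? GIF_OK : GIF_ERROR;
}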
+ */ + + /* first, the local color map */ + if (CopyFrom->ImageDesc.ColorMap != NULL) { + sp->ImageDesc.ColorMap = GifMakeMapObject( + CopyFrom->ImageDesc.ColorMap->ColorCount, + CopyFrom->ImageDesc.ColorMap->Colors); + if (sp->ImageDesc.ColorMap == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + } + + /* next, the raster */ + sp->RasterBits = (unsigned char *)reallocarray( + NULL, + (CopyFrom->ImageDesc.Height * + CopyFrom->ImageDesc.Width), + sizeof(GifPixelType)); + if (sp->RasterBits == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + memcpy(sp->RasterBits, CopyFrom->RasterBits, + sizeof(GifPixelType) * + CopyFrom->ImageDesc.Height * + CopyFrom->ImageDesc.Width); + + /* finally, the extension blocks */ + if (CopyFrom->ExtensionBlocks != NULL) { + sp->ExtensionBlocks = + (ExtensionBlock *)reallocarray( + NULL, CopyFrom->ExtensionBlockCount, + sizeof(ExtensionBlock)); + if (sp->ExtensionBlocks == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + memcpy(sp->ExtensionBlocks, + CopyFrom->ExtensionBlocks, + sizeof(ExtensionBlock) * + CopyFrom->ExtensionBlockCount); + } + } else { + memset((char *)sp, '\0', sizeof(SavedImage)); + } + + return (sp); + } +} + +void GifFreeSavedImages(GifFileType *GifFile) { + SavedImage *sp; + + if ((GifFile == NULL) || (GifFile->SavedImages == NULL)) { + return; + } + for (sp = GifFile->SavedImages; + sp < GifFile->SavedImages + GifFile->ImageCount; sp++) { + if (sp->ImageDesc.ColorMap != NULL) { + GifFreeMapObject(sp->ImageDesc.ColorMap); + sp->ImageDesc.ColorMap = NULL; + } + + if (sp->RasterBits != NULL) { + free((char *)sp->RasterBits); + } + + GifFreeExtensions(&sp->ExtensionBlockCount, + &sp->ExtensionBlocks); + } + free((char *)GifFile->SavedImages); + GifFile->SavedImages = NULL; +} + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c b/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c new file mode 100644 index 00000000000..7d5f1e73a7d --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c @@ -0,0 +1,74 @@ +// @nolint (improperly imported third-party code) +/* + * SPDX-FileCopyrightText: Copyright (C) 2008 Otto Moerbeek + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#ifndef SIZE_MAX +#define SIZE_MAX UINTPTR_MAX +#endif + +/* + * This is sqrt(SIZE_MAX+1), as s1*s2 <= SIZE_MAX + * if both s1 < MUL_NO_OVERFLOW and s2 < MUL_NO_OVERFLOW + */ +#define MUL_NO_OVERFLOW ((size_t)1 << (sizeof(size_t) * 4)) + +void *openbsd_reallocarray(void *optr, size_t nmemb, size_t size) { + if ((nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) && + nmemb > 0 && SIZE_MAX / nmemb < size) { + errno = ENOMEM; + return NULL; + } + /* + * Head off variations in realloc behavior on different + * platforms (reported by MarkR ) + * + * The behaviour of reallocarray is implementation-defined if + * nmemb or size is zero. It can return NULL or non-NULL + * depending on the platform. + * https://www.securecoding.cert.org/confluence/display/c/MEM04-C.Beware+of+zero-lengthallocations + * + * Here are some extracts from realloc man pages on different platforms. + * + * void realloc( void memblock, size_t size ); + * + * Windows: + * + * If there is not enough available memory to expand the block + * to the given size, the original block is left unchanged, + * and NULL is returned. 
If size is zero, then the block + * pointed to by memblock is freed; the return value is NULL, + * and memblock is left pointing at a freed block. + * + * OpenBSD: + * + * If size or nmemb is equal to 0, a unique pointer to an + * access protected, zero sized object is returned. Access via + * this pointer will generate a SIGSEGV exception. + * + * Linux: + * + * If size was equal to 0, either NULL or a pointer suitable + * to be passed to free() is returned. + * + * OS X: + * + * If size is zero and ptr is not NULL, a new, minimum sized + * object is allocated and the original object is freed. + * + * It looks like images with zero width or height can trigger + * this, and fuzzing behaviour will differ by platform, so + * fuzzing on one platform may not detect zero-size allocation + * problems on other platforms. + */ + if (size == 0 || nmemb == 0) { + return NULL; + } + return realloc(optr, size * nmemb); +} diff --git a/torchvision/csrc/io/image/cpu/read_write_file.cpp b/torchvision/csrc/io/image/cpu/read_write_file.cpp index def74c6721a..06de72a5053 100644 --- a/torchvision/csrc/io/image/cpu/read_write_file.cpp +++ b/torchvision/csrc/io/image/cpu/read_write_file.cpp @@ -17,7 +17,7 @@ std::wstring utf8_decode(const std::string& str) { return std::wstring(); } int size_needed = MultiByteToWideChar( - CP_UTF8, 0, str.c_str(), static_cast(str.size()), NULL, 0); + CP_UTF8, 0, str.c_str(), static_cast(str.size()), nullptr, 0); TORCH_CHECK(size_needed > 0, "Error converting the content to Unicode"); std::wstring wstrTo(size_needed, 0); MultiByteToWideChar( diff --git a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp b/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp deleted file mode 100644 index ee7d432f30d..00000000000 --- a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp +++ /dev/null @@ -1,208 +0,0 @@ -#include "decode_jpeg_cuda.h" - -#include - -#if NVJPEG_FOUND -#include -#include -#include -#endif - -#include - -namespace vision { -namespace image { - -#if !NVJPEG_FOUND - -torch::Tensor decode_jpeg_cuda( - const torch::Tensor& data, - ImageReadMode mode, - torch::Device device) { - TORCH_CHECK( - false, "decode_jpeg_cuda: torchvision not compiled with nvJPEG support"); -} - -#else - -namespace { -static nvjpegHandle_t nvjpeg_handle = nullptr; -} - -torch::Tensor decode_jpeg_cuda( - const torch::Tensor& data, - ImageReadMode mode, - torch::Device device) { - C10_LOG_API_USAGE_ONCE( - "torchvision.csrc.io.image.cuda.decode_jpeg_cuda.decode_jpeg_cuda"); - TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); - - TORCH_CHECK( - !data.is_cuda(), - "The input tensor must be on CPU when decoding with nvjpeg") - - TORCH_CHECK( - data.dim() == 1 && data.numel() > 0, - "Expected a non empty 1-dimensional tensor"); - - TORCH_CHECK(device.is_cuda(), "Expected a cuda device") - - int major_version; - int minor_version; - nvjpegStatus_t get_major_property_status = - nvjpegGetProperty(MAJOR_VERSION, &major_version); - nvjpegStatus_t get_minor_property_status = - nvjpegGetProperty(MINOR_VERSION, &minor_version); - - TORCH_CHECK( - get_major_property_status == NVJPEG_STATUS_SUCCESS, - "nvjpegGetProperty failed: ", - get_major_property_status); - TORCH_CHECK( - get_minor_property_status == NVJPEG_STATUS_SUCCESS, - "nvjpegGetProperty failed: ", - get_minor_property_status); - if ((major_version < 11) || ((major_version == 11) && (minor_version < 6))) { - TORCH_WARN_ONCE( - "There is a memory leak issue in the nvjpeg library for CUDA versions < 11.6. 
" - "Make sure to rely on CUDA 11.6 or above before using decode_jpeg(..., device='cuda')."); - } - - at::cuda::CUDAGuard device_guard(device); - - // Create global nvJPEG handle - static std::once_flag nvjpeg_handle_creation_flag; - std::call_once(nvjpeg_handle_creation_flag, []() { - if (nvjpeg_handle == nullptr) { - nvjpegStatus_t create_status = nvjpegCreateSimple(&nvjpeg_handle); - - if (create_status != NVJPEG_STATUS_SUCCESS) { - // Reset handle so that one can still call the function again in the - // same process if there was a failure - free(nvjpeg_handle); - nvjpeg_handle = nullptr; - } - TORCH_CHECK( - create_status == NVJPEG_STATUS_SUCCESS, - "nvjpegCreateSimple failed: ", - create_status); - } - }); - - // Create the jpeg state - nvjpegJpegState_t jpeg_state; - nvjpegStatus_t state_status = - nvjpegJpegStateCreate(nvjpeg_handle, &jpeg_state); - - TORCH_CHECK( - state_status == NVJPEG_STATUS_SUCCESS, - "nvjpegJpegStateCreate failed: ", - state_status); - - auto datap = data.data_ptr(); - - // Get the image information - int num_channels; - nvjpegChromaSubsampling_t subsampling; - int widths[NVJPEG_MAX_COMPONENT]; - int heights[NVJPEG_MAX_COMPONENT]; - nvjpegStatus_t info_status = nvjpegGetImageInfo( - nvjpeg_handle, - datap, - data.numel(), - &num_channels, - &subsampling, - widths, - heights); - - if (info_status != NVJPEG_STATUS_SUCCESS) { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK(false, "nvjpegGetImageInfo failed: ", info_status); - } - - if (subsampling == NVJPEG_CSS_UNKNOWN) { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK(false, "Unknown NVJPEG chroma subsampling"); - } - - int width = widths[0]; - int height = heights[0]; - - nvjpegOutputFormat_t ouput_format; - int num_channels_output; - - switch (mode) { - case IMAGE_READ_MODE_UNCHANGED: - num_channels_output = num_channels; - // For some reason, setting output_format to NVJPEG_OUTPUT_UNCHANGED will - // not properly decode RGB images (it's fine for grayscale), so we set - // output_format manually here - if (num_channels == 1) { - ouput_format = NVJPEG_OUTPUT_Y; - } else if (num_channels == 3) { - ouput_format = NVJPEG_OUTPUT_RGB; - } else { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK( - false, - "When mode is UNCHANGED, only 1 or 3 input channels are allowed."); - } - break; - case IMAGE_READ_MODE_GRAY: - ouput_format = NVJPEG_OUTPUT_Y; - num_channels_output = 1; - break; - case IMAGE_READ_MODE_RGB: - ouput_format = NVJPEG_OUTPUT_RGB; - num_channels_output = 3; - break; - default: - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK( - false, "The provided mode is not supported for JPEG decoding on GPU"); - } - - auto out_tensor = torch::empty( - {int64_t(num_channels_output), int64_t(height), int64_t(width)}, - torch::dtype(torch::kU8).device(device)); - - // nvjpegImage_t is a struct with - // - an array of pointers to each channel - // - the pitch for each channel - // which must be filled in manually - nvjpegImage_t out_image; - - for (int c = 0; c < num_channels_output; c++) { - out_image.channel[c] = out_tensor[c].data_ptr(); - out_image.pitch[c] = width; - } - for (int c = num_channels_output; c < NVJPEG_MAX_COMPONENT; c++) { - out_image.channel[c] = nullptr; - out_image.pitch[c] = 0; - } - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(device.index()); - - nvjpegStatus_t decode_status = nvjpegDecode( - nvjpeg_handle, - jpeg_state, - datap, - data.numel(), - ouput_format, - &out_image, - stream); - - nvjpegJpegStateDestroy(jpeg_state); - - TORCH_CHECK( - decode_status == 
NVJPEG_STATUS_SUCCESS, - "nvjpegDecode failed: ", - decode_status); - - return out_tensor; -} - -#endif // NVJPEG_FOUND - -} // namespace image -} // namespace vision diff --git a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.h b/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.h deleted file mode 100644 index 496b355e9b7..00000000000 --- a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include -#include "../image_read_mode.h" - -namespace vision { -namespace image { - -C10_EXPORT torch::Tensor decode_jpeg_cuda( - const torch::Tensor& data, - ImageReadMode mode, - torch::Device device); - -} // namespace image -} // namespace vision diff --git a/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp new file mode 100644 index 00000000000..85aa6c760c1 --- /dev/null +++ b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp @@ -0,0 +1,602 @@ +#include "decode_jpegs_cuda.h" +#if !NVJPEG_FOUND +namespace vision { +namespace image { +std::vector decode_jpegs_cuda( + const std::vector& encoded_images, + vision::image::ImageReadMode mode, + torch::Device device) { + TORCH_CHECK( + false, "decode_jpegs_cuda: torchvision not compiled with nvJPEG support"); +} +} // namespace image +} // namespace vision + +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace vision { +namespace image { + +std::mutex decoderMutex; +std::unique_ptr cudaJpegDecoder; + +std::vector decode_jpegs_cuda( + const std::vector& encoded_images, + vision::image::ImageReadMode mode, + torch::Device device) { + C10_LOG_API_USAGE_ONCE( + "torchvision.csrc.io.image.cuda.decode_jpegs_cuda.decode_jpegs_cuda"); + + std::lock_guard lock(decoderMutex); + std::vector contig_images; + contig_images.reserve(encoded_images.size()); + + TORCH_CHECK( + device.is_cuda(), "Expected the device parameter to be a cuda device"); + + for (auto& encoded_image : encoded_images) { + TORCH_CHECK( + encoded_image.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); + + TORCH_CHECK( + !encoded_image.is_cuda(), + "The input tensor must be on CPU when decoding with nvjpeg") + + TORCH_CHECK( + encoded_image.dim() == 1 && encoded_image.numel() > 0, + "Expected a non empty 1-dimensional tensor"); + + // nvjpeg requires images to be contiguous + if (encoded_image.is_contiguous()) { + contig_images.push_back(encoded_image); + } else { + contig_images.push_back(encoded_image.contiguous()); + } + } + + int major_version; + int minor_version; + nvjpegStatus_t get_major_property_status = + nvjpegGetProperty(MAJOR_VERSION, &major_version); + nvjpegStatus_t get_minor_property_status = + nvjpegGetProperty(MINOR_VERSION, &minor_version); + + TORCH_CHECK( + get_major_property_status == NVJPEG_STATUS_SUCCESS, + "nvjpegGetProperty failed: ", + get_major_property_status); + TORCH_CHECK( + get_minor_property_status == NVJPEG_STATUS_SUCCESS, + "nvjpegGetProperty failed: ", + get_minor_property_status); + if ((major_version < 11) || ((major_version == 11) && (minor_version < 6))) { + TORCH_WARN_ONCE( + "There is a memory leak issue in the nvjpeg library for CUDA versions < 11.6. 
" + "Make sure to rely on CUDA 11.6 or above before using decode_jpeg(..., device='cuda')."); + } + + at::cuda::CUDAGuard device_guard(device); + + if (cudaJpegDecoder == nullptr || device != cudaJpegDecoder->target_device) { + if (cudaJpegDecoder != nullptr) { + cudaJpegDecoder.reset(new CUDAJpegDecoder(device)); + } else { + cudaJpegDecoder = std::make_unique(device); + std::atexit([]() { cudaJpegDecoder.reset(); }); + } + } + + nvjpegOutputFormat_t output_format; + + switch (mode) { + case vision::image::IMAGE_READ_MODE_UNCHANGED: + // Using NVJPEG_OUTPUT_UNCHANGED causes differently sized output channels + // which is related to the subsampling used I'm not sure why this is the + // case, but for now we're just using RGB and later removing channels from + // grayscale images. + output_format = NVJPEG_OUTPUT_UNCHANGED; + break; + case vision::image::IMAGE_READ_MODE_GRAY: + output_format = NVJPEG_OUTPUT_Y; + break; + case vision::image::IMAGE_READ_MODE_RGB: + output_format = NVJPEG_OUTPUT_RGB; + break; + default: + TORCH_CHECK( + false, "The provided mode is not supported for JPEG decoding on GPU"); + } + + try { + at::cuda::CUDAEvent event; + auto result = cudaJpegDecoder->decode_images(contig_images, output_format); + auto current_stream{ + device.has_index() ? at::cuda::getCurrentCUDAStream( + cudaJpegDecoder->original_device.index()) + : at::cuda::getCurrentCUDAStream()}; + event.record(cudaJpegDecoder->stream); + event.block(current_stream); + return result; + } catch (const std::exception& e) { + if (typeid(e) != typeid(std::runtime_error)) { + TORCH_CHECK(false, "Error while decoding JPEG images: ", e.what()); + } else { + throw; + } + } +} + +CUDAJpegDecoder::CUDAJpegDecoder(const torch::Device& target_device) + : original_device{torch::kCUDA, c10::cuda::current_device()}, + target_device{target_device}, + stream{ + target_device.has_index() + ? 
at::cuda::getStreamFromPool(false, target_device.index()) + : at::cuda::getStreamFromPool(false)} { + nvjpegStatus_t status; + + hw_decode_available = true; + status = nvjpegCreateEx( + NVJPEG_BACKEND_HARDWARE, + NULL, + NULL, + NVJPEG_FLAGS_DEFAULT, + &nvjpeg_handle); + if (status == NVJPEG_STATUS_ARCH_MISMATCH) { + status = nvjpegCreateEx( + NVJPEG_BACKEND_DEFAULT, + NULL, + NULL, + NVJPEG_FLAGS_DEFAULT, + &nvjpeg_handle); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to initialize nvjpeg with default backend: ", + status); + hw_decode_available = false; + } else { + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to initialize nvjpeg with hardware backend: ", + status); + } + + status = nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg state: ", + status); + + status = nvjpegDecoderCreate( + nvjpeg_handle, NVJPEG_BACKEND_DEFAULT, &nvjpeg_decoder); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg decoder: ", + status); + + status = nvjpegDecoderStateCreate( + nvjpeg_handle, nvjpeg_decoder, &nvjpeg_decoupled_state); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg decoder state: ", + status); + + status = nvjpegBufferPinnedCreate(nvjpeg_handle, NULL, &pinned_buffers[0]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create pinned buffer: ", + status); + + status = nvjpegBufferPinnedCreate(nvjpeg_handle, NULL, &pinned_buffers[1]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create pinned buffer: ", + status); + + status = nvjpegBufferDeviceCreate(nvjpeg_handle, NULL, &device_buffer); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create device buffer: ", + status); + + status = nvjpegJpegStreamCreate(nvjpeg_handle, &jpeg_streams[0]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create jpeg stream: ", + status); + + status = nvjpegJpegStreamCreate(nvjpeg_handle, &jpeg_streams[1]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create jpeg stream: ", + status); + + status = nvjpegDecodeParamsCreate(nvjpeg_handle, &nvjpeg_decode_params); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create decode params: ", + status); +} + +CUDAJpegDecoder::~CUDAJpegDecoder() { + /* + The below code works on Mac and Linux, but fails on Windows. + This is because on Windows, the atexit hook which calls this + destructor executes after cuda is already shut down causing SIGSEGV. + We do not have a solution to this problem at the moment, so we'll + just leak the libnvjpeg & cuda variables for the time being and hope + that the CUDA runtime handles cleanup for us. + Please send a PR if you have a solution for this problem. 
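One possible mitigation for the late-atexit problem is to probe whether the CUDA runtime is still alive and only then attempt the nvjpeg teardown; the commented-out encoder destructor later in this diff uses the same cudaGetDeviceCount probe. A minimal sketch, assuming only the CUDA runtime API; the helper name cuda_runtime_is_alive is illustrative:

#include <cuda_runtime_api.h>

// Returns false once the CUDA runtime has shut down (e.g. the probe fails
// with cudaErrorCudartUnloading), in which case calling nvjpeg*Destroy or
// other teardown functions would be unsafe and leaking is the lesser evil.
static bool cuda_runtime_is_alive() {
  int device_count = 0;
  return cudaGetDeviceCount(&device_count) == cudaSuccess;
}

// The destructor body could then begin with:
//   if (!cuda_runtime_is_alive()) {
//     return;  // runtime already gone; deliberately leak, as described above
//   }
// followed by the nvjpeg*Destroy calls listed (commented out) just below.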
+ */ + + // nvjpegStatus_t status; + + // status = nvjpegDecodeParamsDestroy(nvjpeg_decode_params); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg decode params: ", + // status); + + // status = nvjpegJpegStreamDestroy(jpeg_streams[0]); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy jpeg stream: ", + // status); + + // status = nvjpegJpegStreamDestroy(jpeg_streams[1]); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy jpeg stream: ", + // status); + + // status = nvjpegBufferPinnedDestroy(pinned_buffers[0]); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy pinned buffer[0]: ", + // status); + + // status = nvjpegBufferPinnedDestroy(pinned_buffers[1]); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy pinned buffer[1]: ", + // status); + + // status = nvjpegBufferDeviceDestroy(device_buffer); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy device buffer: ", + // status); + + // status = nvjpegJpegStateDestroy(nvjpeg_decoupled_state); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg decoupled state: ", + // status); + + // status = nvjpegDecoderDestroy(nvjpeg_decoder); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg decoder: ", + // status); + + // status = nvjpegJpegStateDestroy(nvjpeg_state); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg state: ", + // status); + + // status = nvjpegDestroy(nvjpeg_handle); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, "nvjpegDestroy failed: ", status); +} + +std::tuple< + std::vector, + std::vector, + std::vector> +CUDAJpegDecoder::prepare_buffers( + const std::vector& encoded_images, + const nvjpegOutputFormat_t& output_format) { + /* + This function scans the encoded images' jpeg headers and + allocates decoding buffers based on the metadata found + + Args: + - encoded_images (std::vector): a vector of tensors + containing the jpeg bitstreams to be decoded. Each tensor must have dtype + torch.uint8 and device cpu + - output_format (nvjpegOutputFormat_t): NVJPEG_OUTPUT_RGB, NVJPEG_OUTPUT_Y + or NVJPEG_OUTPUT_UNCHANGED + + Returns: + - decoded_images (std::vector): a vector of nvjpegImages + containing pointers to the memory of the decoded images + - output_tensors (std::vector): a vector of Tensors + containing the decoded images. 
`decoded_images` points to the memory of + output_tensors + - channels (std::vector): a vector of ints containing the number of + output image channels for every image + */ + + int width[NVJPEG_MAX_COMPONENT]; + int height[NVJPEG_MAX_COMPONENT]; + std::vector channels(encoded_images.size()); + nvjpegChromaSubsampling_t subsampling; + nvjpegStatus_t status; + + std::vector output_tensors{encoded_images.size()}; + std::vector decoded_images{encoded_images.size()}; + + for (std::vector::size_type i = 0; i < encoded_images.size(); + i++) { + // extract bitstream meta data to figure out the number of channels, height, + // width for every image + status = nvjpegGetImageInfo( + nvjpeg_handle, + (unsigned char*)encoded_images[i].data_ptr(), + encoded_images[i].numel(), + &channels[i], + &subsampling, + width, + height); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, "Failed to get image info: ", status); + + TORCH_CHECK( + subsampling != NVJPEG_CSS_UNKNOWN, "Unknown chroma subsampling"); + + // output channels may be different from the actual number of channels in + // the image, e.g. we decode a grayscale image as RGB and slice off the + // extra channels later + int output_channels = 3; + if (output_format == NVJPEG_OUTPUT_RGB || + output_format == NVJPEG_OUTPUT_UNCHANGED) { + output_channels = 3; + } else if (output_format == NVJPEG_OUTPUT_Y) { + output_channels = 1; + } + + // reserve output buffer + auto output_tensor = torch::empty( + {int64_t(output_channels), int64_t(height[0]), int64_t(width[0])}, + torch::dtype(torch::kU8).device(target_device)); + output_tensors[i] = output_tensor; + + // fill nvjpegImage_t struct + for (int c = 0; c < output_channels; c++) { + decoded_images[i].channel[c] = output_tensor[c].data_ptr(); + decoded_images[i].pitch[c] = width[0]; + } + for (int c = output_channels; c < NVJPEG_MAX_COMPONENT; c++) { + decoded_images[i].channel[c] = NULL; + decoded_images[i].pitch[c] = 0; + } + } + return {decoded_images, output_tensors, channels}; +} + +std::vector CUDAJpegDecoder::decode_images( + const std::vector& encoded_images, + const nvjpegOutputFormat_t& output_format) { + /* + This function decodes a batch of jpeg bitstreams. + We scan all encoded bitstreams and sort them into two groups: + 1. Baseline JPEGs: Can be decoded with hardware support on A100+ GPUs. + 2. Other JPEGs (e.g. progressive JPEGs): Can also be decoded on the + GPU (albeit with software support only) but need some preprocessing on the + host first. + + See + https://github.com/NVIDIA/CUDALibrarySamples/blob/f17940ac4e705bf47a8c39f5365925c1665f6c98/nvJPEG/nvJPEG-Decoder/nvjpegDecoder.cpp#L33 + for reference. + + Args: + - encoded_images (std::vector): a vector of tensors + containing the jpeg bitstreams to be decoded + - output_format (nvjpegOutputFormat_t): NVJPEG_OUTPUT_RGB, NVJPEG_OUTPUT_Y + or NVJPEG_OUTPUT_UNCHANGED + - device (torch::Device): The desired CUDA device for the returned Tensors + + Returns: + - output_tensors (std::vector): a vector of Tensors + containing the decoded images + */ + + auto [decoded_imgs_buf, output_tensors, channels] = + prepare_buffers(encoded_images, output_format); + + nvjpegStatus_t status; + cudaError_t cudaStatus; + + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK( + cudaStatus == cudaSuccess, + "Failed to synchronize CUDA stream: ", + cudaStatus); + + // baseline JPEGs can be batch decoded with hardware support on A100+ GPUs + // ultra fast! 
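For orientation, a caller-side sketch of how this decoder is reached through the public decode_jpegs_cuda() entry point. The wrapper function name, the file names and the device index are placeholders; read_file() is torchvision's existing helper that loads a file into a 1-D uint8 CPU tensor, and the exact include path depends on the build setup:

#include "image.h"  // torchvision/csrc/io/image/image.h (path is an assumption)

std::vector<torch::Tensor> decode_two_jpegs_on_gpu() {
  std::vector<torch::Tensor> encoded = {
      vision::image::read_file("a.jpg"),
      vision::image::read_file("b.jpg"),
  };
  // Baseline JPEGs in the batch take the batched hardware path below on A100+
  // GPUs; anything else (e.g. progressive JPEGs) falls back to the per-image
  // software path on the same CUDA stream.
  return vision::image::decode_jpegs_cuda(
      encoded,
      vision::image::IMAGE_READ_MODE_RGB,
      torch::Device(torch::kCUDA, 0));  // each output: 3xHxW uint8 on cuda:0
}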
+ std::vector hw_input_buffer; + std::vector hw_input_buffer_size; + std::vector hw_output_buffer; + + // other JPEG types such as progressive JPEGs can be decoded one-by-one in + // software slow :( + std::vector sw_input_buffer; + std::vector sw_input_buffer_size; + std::vector sw_output_buffer; + + if (hw_decode_available) { + for (std::vector::size_type i = 0; i < encoded_images.size(); + ++i) { + // extract bitstream meta data to figure out whether a bit-stream can be + // decoded + nvjpegJpegStreamParseHeader( + nvjpeg_handle, + encoded_images[i].data_ptr(), + encoded_images[i].numel(), + jpeg_streams[0]); + int isSupported = -1; + nvjpegDecodeBatchedSupported( + nvjpeg_handle, jpeg_streams[0], &isSupported); + + if (isSupported == 0) { + hw_input_buffer.push_back(encoded_images[i].data_ptr()); + hw_input_buffer_size.push_back(encoded_images[i].numel()); + hw_output_buffer.push_back(decoded_imgs_buf[i]); + } else { + sw_input_buffer.push_back(encoded_images[i].data_ptr()); + sw_input_buffer_size.push_back(encoded_images[i].numel()); + sw_output_buffer.push_back(decoded_imgs_buf[i]); + } + } + } else { + for (std::vector::size_type i = 0; i < encoded_images.size(); + ++i) { + sw_input_buffer.push_back(encoded_images[i].data_ptr()); + sw_input_buffer_size.push_back(encoded_images[i].numel()); + sw_output_buffer.push_back(decoded_imgs_buf[i]); + } + } + + if (hw_input_buffer.size() > 0) { + // UNCHANGED behaves weird, so we use RGB instead + status = nvjpegDecodeBatchedInitialize( + nvjpeg_handle, + nvjpeg_state, + hw_input_buffer.size(), + 1, + output_format == NVJPEG_OUTPUT_UNCHANGED ? NVJPEG_OUTPUT_RGB + : output_format); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to initialize batch decoding: ", + status); + + status = nvjpegDecodeBatched( + nvjpeg_handle, + nvjpeg_state, + hw_input_buffer.data(), + hw_input_buffer_size.data(), + hw_output_buffer.data(), + stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, "Failed to decode batch: ", status); + } + + if (sw_input_buffer.size() > 0) { + status = + nvjpegStateAttachDeviceBuffer(nvjpeg_decoupled_state, device_buffer); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to attach device buffer: ", + status); + int buffer_index = 0; + // UNCHANGED behaves weird, so we use RGB instead + status = nvjpegDecodeParamsSetOutputFormat( + nvjpeg_decode_params, + output_format == NVJPEG_OUTPUT_UNCHANGED ? 
NVJPEG_OUTPUT_RGB + : output_format); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to set output format: ", + status); + for (std::vector::size_type i = 0; i < sw_input_buffer.size(); + ++i) { + status = nvjpegJpegStreamParse( + nvjpeg_handle, + sw_input_buffer[i], + sw_input_buffer_size[i], + 0, + 0, + jpeg_streams[buffer_index]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to parse jpeg stream: ", + status); + + status = nvjpegStateAttachPinnedBuffer( + nvjpeg_decoupled_state, pinned_buffers[buffer_index]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to attach pinned buffer: ", + status); + + status = nvjpegDecodeJpegHost( + nvjpeg_handle, + nvjpeg_decoder, + nvjpeg_decoupled_state, + nvjpeg_decode_params, + jpeg_streams[buffer_index]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to decode jpeg stream: ", + status); + + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK( + cudaStatus == cudaSuccess, + "Failed to synchronize CUDA stream: ", + cudaStatus); + + status = nvjpegDecodeJpegTransferToDevice( + nvjpeg_handle, + nvjpeg_decoder, + nvjpeg_decoupled_state, + jpeg_streams[buffer_index], + stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to transfer jpeg to device: ", + status); + + buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode + // to avoid an extra sync + + status = nvjpegDecodeJpegDevice( + nvjpeg_handle, + nvjpeg_decoder, + nvjpeg_decoupled_state, + &sw_output_buffer[i], + stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to decode jpeg stream: ", + status); + } + } + + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK( + cudaStatus == cudaSuccess, + "Failed to synchronize CUDA stream: ", + cudaStatus); + + // prune extraneous channels from single channel images + if (output_format == NVJPEG_OUTPUT_UNCHANGED) { + for (std::vector::size_type i = 0; i < output_tensors.size(); + ++i) { + if (channels[i] == 1) { + output_tensors[i] = output_tensors[i][0].unsqueeze(0).clone(); + } + } + } + + return output_tensors; +} + +} // namespace image +} // namespace vision + +#endif diff --git a/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h new file mode 100644 index 00000000000..6f72d9e35b2 --- /dev/null +++ b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include +#include "../common.h" + +#if NVJPEG_FOUND +#include +#include + +namespace vision { +namespace image { +class CUDAJpegDecoder { + public: + CUDAJpegDecoder(const torch::Device& target_device); + ~CUDAJpegDecoder(); + + std::vector decode_images( + const std::vector& encoded_images, + const nvjpegOutputFormat_t& output_format); + + const torch::Device original_device; + const torch::Device target_device; + const c10::cuda::CUDAStream stream; + + private: + std::tuple< + std::vector, + std::vector, + std::vector> + prepare_buffers( + const std::vector& encoded_images, + const nvjpegOutputFormat_t& output_format); + nvjpegJpegState_t nvjpeg_state; + nvjpegJpegState_t nvjpeg_decoupled_state; + nvjpegBufferPinned_t pinned_buffers[2]; + nvjpegBufferDevice_t device_buffer; + nvjpegJpegStream_t jpeg_streams[2]; + nvjpegDecodeParams_t nvjpeg_decode_params; + nvjpegJpegDecoder_t nvjpeg_decoder; + bool hw_decode_available{false}; + nvjpegHandle_t nvjpeg_handle; +}; +} // namespace image +} // namespace vision +#endif diff --git a/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h 
b/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h new file mode 100644 index 00000000000..8c3ad8f9a9d --- /dev/null +++ b/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include "../common.h" +#include "decode_jpegs_cuda.h" +#include "encode_jpegs_cuda.h" + +namespace vision { +namespace image { + +/* + +Fast jpeg decoding with CUDA. +A100+ GPUs have dedicated hardware support for jpeg decoding. + +Args: + - encoded_images (const std::vector&): a vector of tensors + containing the jpeg bitstreams to be decoded. Each tensor must have dtype + torch.uint8 and device cpu + - mode (ImageReadMode): IMAGE_READ_MODE_UNCHANGED, IMAGE_READ_MODE_GRAY and +IMAGE_READ_MODE_RGB are supported + - device (torch::Device): The desired CUDA device to run the decoding on and +which will contain the output tensors + +Returns: + - decoded_images (std::vector): a vector of torch::Tensors of +dtype torch.uint8 on the specified containing the decoded images + +Notes: + - If a single image fails, the whole batch fails. + - This function is thread-safe +*/ +C10_EXPORT std::vector decode_jpegs_cuda( + const std::vector& encoded_images, + vision::image::ImageReadMode mode, + torch::Device device); + +/* +Fast jpeg encoding with CUDA. + +Args: + - decoded_images (const std::vector&): a vector of contiguous +CUDA tensors of dtype torch.uint8 to be encoded. + - quality (int64_t): 0-100, 75 is the default + +Returns: + - encoded_images (std::vector): a vector of CUDA +torch::Tensors of dtype torch.uint8 containing the encoded images + +Notes: + - If a single image fails, the whole batch fails. + - This function is thread-safe +*/ +C10_EXPORT std::vector encode_jpegs_cuda( + const std::vector& decoded_images, + const int64_t quality); + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp b/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp new file mode 100644 index 00000000000..80accc1a241 --- /dev/null +++ b/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp @@ -0,0 +1,278 @@ +#include "encode_jpegs_cuda.h" +#if !NVJPEG_FOUND +namespace vision { +namespace image { +std::vector encode_jpegs_cuda( + const std::vector& decoded_images, + const int64_t quality) { + TORCH_CHECK( + false, "encode_jpegs_cuda: torchvision not compiled with nvJPEG support"); +} +} // namespace image +} // namespace vision +#else + +#include +#include +#include +#include +#include +#include "c10/core/ScalarType.h" + +namespace vision { +namespace image { + +// We use global variables to cache the encoder and decoder instances and +// reuse them across calls to the corresponding pytorch functions +std::mutex encoderMutex; +std::unique_ptr cudaJpegEncoder; + +std::vector encode_jpegs_cuda( + const std::vector& decoded_images, + const int64_t quality) { + C10_LOG_API_USAGE_ONCE( + "torchvision.csrc.io.image.cuda.encode_jpegs_cuda.encode_jpegs_cuda"); + + // Some nvjpeg structures are not thread safe so we're keeping it single + // threaded for now. In the future this may be an opportunity to unlock + // further speedups + std::lock_guard lock(encoderMutex); + TORCH_CHECK(decoded_images.size() > 0, "Empty input tensor list"); + torch::Device device = decoded_images[0].device(); + at::cuda::CUDAGuard device_guard(device); + + // lazy init of the encoder class + // the encoder object holds on to a lot of state and is expensive to create, + // so we reuse it across calls. 
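A matching caller-side sketch for the encoding entry point documented in the header above; the image size, quality value and wrapper name are illustrative. Inputs must be 3xHxW uint8 tensors already on the target CUDA device, and each returned tensor holds a JPEG bitstream on that same device:

torch::Tensor encode_one_image_on_gpu() {
  // A synthetic RGB image resident on cuda:0.
  torch::Tensor image = torch::randint(
      0, 256, {3, 480, 640}, torch::dtype(torch::kU8).device(torch::kCUDA, 0));
  std::vector<torch::Tensor> encoded =
      vision::image::encode_jpegs_cuda({image}, /*quality=*/90);
  // encoded[0] is a 1-D uint8 CUDA tensor with the JPEG bitstream; move it to
  // CPU (e.g. encoded[0].cpu()) before handing it to vision::image::write_file().
  return encoded[0];
}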
NB: the cached structures are device specific + // and cannot be reused across devices + if (cudaJpegEncoder == nullptr || device != cudaJpegEncoder->target_device) { + if (cudaJpegEncoder != nullptr) { + delete cudaJpegEncoder.release(); + } + + cudaJpegEncoder = std::make_unique(device); + + // Unfortunately, we cannot rely on the smart pointer releasing the encoder + // object correctly upon program exit. This is because, when cudaJpegEncoder + // gets destroyed, the CUDA runtime may already be shut down, rendering all + // destroy* calls in the encoder destructor invalid. Instead, we use an + // atexit hook which executes after main() finishes, but hopefully before + // CUDA shuts down when the program exits. If CUDA is already shut down the + // destructor will detect this and will not attempt to destroy any encoder + // structures. + std::atexit([]() { delete cudaJpegEncoder.release(); }); + } + + std::vector contig_images; + contig_images.reserve(decoded_images.size()); + for (const auto& image : decoded_images) { + TORCH_CHECK( + image.dtype() == torch::kU8, "Input tensor dtype should be uint8"); + + TORCH_CHECK( + image.device() == device, + "All input tensors must be on the same CUDA device when encoding with nvjpeg") + + TORCH_CHECK( + image.dim() == 3 && image.numel() > 0, + "Input data should be a 3-dimensional tensor"); + + TORCH_CHECK( + image.size(0) == 3, + "The number of channels should be 3, got: ", + image.size(0)); + + // nvjpeg requires images to be contiguous + if (image.is_contiguous()) { + contig_images.push_back(image); + } else { + contig_images.push_back(image.contiguous()); + } + } + + cudaJpegEncoder->set_quality(quality); + std::vector encoded_images; + for (const auto& image : contig_images) { + auto encoded_image = cudaJpegEncoder->encode_jpeg(image); + encoded_images.push_back(encoded_image); + } + at::cuda::CUDAEvent event; + event.record(cudaJpegEncoder->stream); + + // We use a dedicated stream to do the encoding and even though the results + // may be ready on that stream we cannot assume that they are also available + // on the current stream of the calling context when this function returns. We + // use a blocking event to ensure that this is indeed the case. Crucially, we + // do not want to block the host at this particular point + // (which is what cudaStreamSynchronize would do.) Events allow us to + // synchronize the streams without blocking the host. + event.block(cudaJpegEncoder->current_stream); + return encoded_images; +} + +CUDAJpegEncoder::CUDAJpegEncoder(const torch::Device& target_device) + : original_device{torch::kCUDA, torch::cuda::current_device()}, + target_device{target_device}, + stream{ + target_device.has_index() + ? at::cuda::getStreamFromPool(false, target_device.index()) + : at::cuda::getStreamFromPool(false)}, + current_stream{ + original_device.has_index() + ? 
at::cuda::getCurrentCUDAStream(original_device.index()) + : at::cuda::getCurrentCUDAStream()} { + nvjpegStatus_t status; + status = nvjpegCreateSimple(&nvjpeg_handle); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg handle: ", + status); + + status = nvjpegEncoderStateCreate(nvjpeg_handle, &nv_enc_state, stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg encoder state: ", + status); + + status = nvjpegEncoderParamsCreate(nvjpeg_handle, &nv_enc_params, stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg encoder params: ", + status); +} + +CUDAJpegEncoder::~CUDAJpegEncoder() { + /* + The below code works on Mac and Linux, but fails on Windows. + This is because on Windows, the atexit hook which calls this + destructor executes after cuda is already shut down causing SIGSEGV. + We do not have a solution to this problem at the moment, so we'll + just leak the libnvjpeg & cuda variables for the time being and hope + that the CUDA runtime handles cleanup for us. + Please send a PR if you have a solution for this problem. + */ + + // // We run cudaGetDeviceCount as a dummy to test if the CUDA runtime is + // still + // // initialized. If it is not, we can skip the rest of this function as it + // is + // // unsafe to execute. + // int deviceCount = 0; + // cudaError_t error = cudaGetDeviceCount(&deviceCount); + // if (error != cudaSuccess) + // return; // CUDA runtime has already shut down. There's nothing we can do + // // now. + + // nvjpegStatus_t status; + + // status = nvjpegEncoderParamsDestroy(nv_enc_params); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg encoder params: ", + // status); + + // status = nvjpegEncoderStateDestroy(nv_enc_state); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg encoder state: ", + // status); + + // cudaStreamSynchronize(stream); + + // status = nvjpegDestroy(nvjpeg_handle); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, "nvjpegDestroy failed: ", status); +} + +torch::Tensor CUDAJpegEncoder::encode_jpeg(const torch::Tensor& src_image) { + nvjpegStatus_t status; + cudaError_t cudaStatus; + + // Ensure that the incoming src_image is safe to use + cudaStatus = cudaStreamSynchronize(current_stream); + TORCH_CHECK(cudaStatus == cudaSuccess, "CUDA ERROR: ", cudaStatus); + + int channels = src_image.size(0); + int height = src_image.size(1); + int width = src_image.size(2); + + status = nvjpegEncoderParamsSetSamplingFactors( + nv_enc_params, NVJPEG_CSS_444, stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to set nvjpeg encoder params sampling factors: ", + status); + + nvjpegImage_t target_image; + for (int c = 0; c < channels; c++) { + target_image.channel[c] = src_image[c].data_ptr(); + // this is why we need contiguous tensors + target_image.pitch[c] = width; + } + for (int c = channels; c < NVJPEG_MAX_COMPONENT; c++) { + target_image.channel[c] = nullptr; + target_image.pitch[c] = 0; + } + // Encode the image + status = nvjpegEncodeImage( + nvjpeg_handle, + nv_enc_state, + nv_enc_params, + &target_image, + NVJPEG_INPUT_RGB, + width, + height, + stream); + + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, "image encoding failed: ", status); + // Retrieve length of the encoded image + size_t length; + status = nvjpegEncodeRetrieveBitstreamDevice( + nvjpeg_handle, nv_enc_state, NULL, &length, stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to 
retrieve encoded image stream state: ", + status); + + // Synchronize the stream to ensure that the encoded image is ready + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK(cudaStatus == cudaSuccess, "CUDA ERROR: ", cudaStatus); + + // Reserve buffer for the encoded image + torch::Tensor encoded_image = torch::empty( + {static_cast(length)}, + torch::TensorOptions() + .dtype(torch::kByte) + .layout(torch::kStrided) + .device(target_device) + .requires_grad(false)); + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK(cudaStatus == cudaSuccess, "CUDA ERROR: ", cudaStatus); + // Retrieve the encoded image + status = nvjpegEncodeRetrieveBitstreamDevice( + nvjpeg_handle, + nv_enc_state, + encoded_image.data_ptr(), + &length, + stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to retrieve encoded image: ", + status); + return encoded_image; +} + +void CUDAJpegEncoder::set_quality(const int64_t quality) { + nvjpegStatus_t paramsQualityStatus = + nvjpegEncoderParamsSetQuality(nv_enc_params, quality, stream); + TORCH_CHECK( + paramsQualityStatus == NVJPEG_STATUS_SUCCESS, + "Failed to set nvjpeg encoder params quality: ", + paramsQualityStatus); +} + +} // namespace image +} // namespace vision + +#endif // NVJPEG_FOUND diff --git a/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h b/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h new file mode 100644 index 00000000000..6ee0ad91df4 --- /dev/null +++ b/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h @@ -0,0 +1,34 @@ +#pragma once +#include +#include +#if NVJPEG_FOUND + +#include +#include +#include + +namespace vision { +namespace image { + +class CUDAJpegEncoder { + public: + CUDAJpegEncoder(const torch::Device& device); + ~CUDAJpegEncoder(); + + torch::Tensor encode_jpeg(const torch::Tensor& src_image); + + void set_quality(const int64_t quality); + + const torch::Device original_device; + const torch::Device target_device; + const c10::cuda::CUDAStream stream; + const c10::cuda::CUDAStream current_stream; + + protected: + nvjpegEncoderState_t nv_enc_state; + nvjpegEncoderParams_t nv_enc_params; + nvjpegHandle_t nvjpeg_handle; +}; +} // namespace image +} // namespace vision +#endif diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp index 3c9d632f030..b4a4ed54a67 100644 --- a/torchvision/csrc/io/image/image.cpp +++ b/torchvision/csrc/io/image/image.cpp @@ -1,33 +1,29 @@ #include "image.h" #include -#ifdef USE_PYTHON -#include -#endif - -// If we are in a Windows environment, we need to define -// initialization functions for the _custom_ops extension -#ifdef USE_PYTHON -#ifdef _WIN32 -PyMODINIT_FUNC PyInit_image(void) { - // No need to do anything. 
- return NULL; -} -#endif -#endif // USE_PYTHON namespace vision { namespace image { -static auto registry = torch::RegisterOperators() - .op("image::decode_png", &decode_png) - .op("image::encode_png", &encode_png) - .op("image::decode_jpeg", &decode_jpeg) - .op("image::encode_jpeg", &encode_jpeg) - .op("image::read_file", &read_file) - .op("image::write_file", &write_file) - .op("image::decode_image", &decode_image) - .op("image::decode_jpeg_cuda", &decode_jpeg_cuda); +static auto registry = + torch::RegisterOperators() + .op("image::decode_gif", &decode_gif) + .op("image::decode_png(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor", + &decode_png) + .op("image::encode_png", &encode_png) + .op("image::decode_jpeg(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor", + &decode_jpeg) + .op("image::decode_webp(Tensor encoded_data, int mode) -> Tensor", + &decode_webp) + .op("image::encode_jpeg", &encode_jpeg) + .op("image::read_file", &read_file) + .op("image::write_file", &write_file) + .op("image::decode_image(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor", + &decode_image) + .op("image::decode_jpegs_cuda", &decode_jpegs_cuda) + .op("image::encode_jpegs_cuda", &encode_jpegs_cuda) + .op("image::_jpeg_version", &_jpeg_version) + .op("image::_is_compiled_against_turbo", &_is_compiled_against_turbo); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/image.h b/torchvision/csrc/io/image/image.h index 05bac44c77d..3f47fdec65c 100644 --- a/torchvision/csrc/io/image/image.h +++ b/torchvision/csrc/io/image/image.h @@ -1,9 +1,11 @@ #pragma once +#include "cpu/decode_gif.h" #include "cpu/decode_image.h" #include "cpu/decode_jpeg.h" #include "cpu/decode_png.h" +#include "cpu/decode_webp.h" #include "cpu/encode_jpeg.h" #include "cpu/encode_png.h" #include "cpu/read_write_file.h" -#include "cuda/decode_jpeg_cuda.h" +#include "cuda/encode_decode_jpegs_cuda.h" diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp index 38b35014595..8f1fb3fb5b9 100644 --- a/torchvision/csrc/io/video/video.cpp +++ b/torchvision/csrc/io/video/video.cpp @@ -2,6 +2,8 @@ #include +using namespace ffmpeg; + namespace vision { namespace video { @@ -77,7 +79,7 @@ std::tuple _parseStream(const std::string& streamString) { long index_ = -1; if (match[2].matched) { try { - index_ = c10::stoi(match[2].str()); + index_ = std::stoi(match[2].str()); } catch (const std::exception&) { TORCH_CHECK( false, @@ -156,14 +158,34 @@ void Video::_getDecoderParams( } // _get decoder params -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); +void Video::initFromFile( + std::string videoPath, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + params.uri = videoPath; + _init(stream, numThreads); +} + +void Video::initFromMemory( + torch::Tensor videoTensor, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + callback = MemoryBuffer::getCallback( + videoTensor.data_ptr(), videoTensor.size(0)); + _init(stream, numThreads); +} + +void Video::_init(std::string stream, int64_t numThreads) { // set number of threads global numThreads_ = numThreads; // parse stream information current_stream = _parseStream(stream); // note that in the initial call we want to get 
all streams - Video::_getDecoderParams( + _getDecoderParams( 0, // video start 0, // headerOnly std::get<0>(current_stream), // stream info - remove that @@ -175,11 +197,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { std::string logMessage, logType; - // TODO: add read from memory option - params.uri = videoPath; - logType = "file"; - logMessage = videoPath; - // locals std::vector audioFPS, videoFPS; std::vector audioDuration, videoDuration, ccDuration, subsDuration; @@ -190,7 +207,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { c10::Dict> subsMetadata; // callback and metadata defined in struct - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); if (succeeded) { for (const auto& header : metadata) { double fps = double(header.fps); @@ -225,16 +243,23 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { streamsMetadata.insert("subtitles", subsMetadata); streamsMetadata.insert("cc", ccMetadata); - succeeded = Video::setCurrentStream(stream); - LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n"; + succeeded = setCurrentStream(stream); if (std::get<1>(current_stream) != -1) { LOG(INFO) << "Stream index set to " << std::get<1>(current_stream) << ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; } +} + +Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { + C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); + if (!videoPath.empty()) { + initFromFile(videoPath, stream, numThreads); + } } // video bool Video::setCurrentStream(std::string stream = "video") { + TORCH_CHECK(initialized, "Video object has to be initialized first"); if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { current_stream = _parseStream(stream); } @@ -256,19 +281,23 @@ bool Video::setCurrentStream(std::string stream = "video") { ); // callback and metadata defined in Video.h - return (decoder.init(params, std::move(callback), &metadata)); + DecoderInCallback tmp_callback = callback; + return (decoder.init(params, std::move(tmp_callback), &metadata)); } std::tuple Video::getCurrentStream() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return current_stream; } c10::Dict>> Video:: getStreamMetadata() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return streamsMetadata; } void Video::Seek(double ts, bool fastSeek = false) { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // initialize the class variables used for seeking and retrurn _getDecoderParams( ts, // video start @@ -282,20 +311,21 @@ void Video::Seek(double ts, bool fastSeek = false) { ); // callback and metadata defined in Video.h - succeeded = decoder.init(params, std::move(callback), &metadata); - LOG(INFO) << "Decoder init at seek " << succeeded << "\n"; + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); } std::tuple Video::Next() { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // if failing to decode simply return a null tensor (note, should we - // raise an exeption?) + // raise an exception?) 
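A sketch of how the new split initialization is intended to be driven (assumed usage; the stream spec "video", the thread count and the include path are placeholders). Constructing Video with an empty path defers initialization, and initFromMemory() then feeds it the raw bitstream tensor instead of a file path:

#include "video.h"  // torchvision/csrc/io/video/video.h (path is an assumption)

std::tuple<torch::Tensor, double> first_frame_from_memory(torch::Tensor bitstream) {
  // Empty path: the constructor skips initFromFile() and leaves the object
  // uninitialized until one of the init* methods is called, exactly once.
  auto video = c10::make_intrusive<vision::video::Video>(
      /*videoPath=*/"", /*stream=*/"video", /*numThreads=*/1);
  video->initFromMemory(bitstream, "video", /*numThreads=*/1);
  return video->Next();  // (decoded frame, presentation timestamp in seconds)
}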
double frame_pts_s; torch::Tensor outFrame = torch::zeros({0}, torch::kByte); // decode single frame DecoderOutputMessage out; int64_t res = decoder.decode(&out, decoderTimeoutMs); - // if successfull + // if successful if (res == 0) { frame_pts_s = double(double(out.header.pts) * 1e-6); @@ -345,6 +375,8 @@ std::tuple Video::Next() { static auto registerVideo = torch::class_