diff --git a/.circleci/config.yml b/.circleci/config.yml index 9318a2d07720f8..0bd8e8e1a4a272 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -632,7 +632,8 @@ jobs: } if is_vanilla_build; then - echo "apt-get update && apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash + echo "apt-get update || apt-get install libgnutls30" | docker exec -u root -i "$id" bash + echo "apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash echo "cd workspace/build; qemu-x86_64 -g 2345 -cpu Broadwell -E ATEN_CPU_CAPABILITY=default ./bin/basic --gtest_filter=BasicTest.BasicTestCPU & gdb ./bin/basic -ex 'set pagination off' -ex 'target remote :2345' -ex 'continue' -ex 'bt' -ex='set confirm off' -ex 'quit \$_isvoid(\$_exitcode)'" | docker exec -u jenkins -i "$id" bash else echo "Skipping for ${BUILD_ENVIRONMENT}" @@ -1734,16 +1735,17 @@ jobs: # install fastlane sudo gem install bundler && bundle install # install certificates - echo ${IOS_CERT_KEY} >> cert.txt + echo ${IOS_CERT_KEY_2022} >> cert.txt base64 --decode cert.txt -o Certificates.p12 rm cert.txt - bundle exec fastlane install_cert + bundle exec fastlane install_root_cert + bundle exec fastlane install_dev_cert # install the provisioning profile - PROFILE=PyTorch_CI_2021.mobileprovision + PROFILE=PyTorch_CI_2022.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" - echo ${IOS_SIGN_KEY} >> cert.txt + echo ${IOS_SIGN_KEY_2022} >> cert.txt base64 --decode cert.txt -o ${PROFILE} rm cert.txt - run: @@ -1802,7 +1804,7 @@ jobs: command: | set -e PROJ_ROOT=/Users/distiller/project - PROFILE=PyTorch_CI_2021 + PROFILE=PyTorch_CI_2022 # run the ruby build script if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index bebbd3f04ddb5b..6e6acf518d8903 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -329,6 +329,7 @@ docker build \ --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ + --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx900;gfx906;gfx908}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index 044ff869a2bf71..57156b31fd8e35 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -11,8 +11,13 @@ install_ubuntu() { # "$UBUNTU_VERSION" == "18.04" if [[ "$UBUNTU_VERSION" == "18.04"* ]]; then cmake3="cmake=3.10*" + maybe_libiomp_dev="libiomp-dev" + elif [[ "$UBUNTU_VERSION" == "20.04"* ]]; then + cmake3="cmake=3.16*" + maybe_libiomp_dev="" else cmake3="cmake=3.5*" + maybe_libiomp_dev="libiomp-dev" fi # Install common dependencies @@ -33,7 +38,7 @@ install_ubuntu() { git \ libatlas-base-dev \ libc6-dbg \ - libiomp-dev \ + ${maybe_libiomp_dev} \ libyaml-dev \ libz-dev \ libjpeg-dev \ @@ -107,11 +112,13 @@ case "$ID" in esac # Install Valgrind separately since the apt-get version is too old. +OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +if [[ $ID == centos && $OS_VERSION == 7 ]]; then WGET_FLAG="--no-check-certificate" ; else WGET_FLAG=""; fi mkdir valgrind_build && cd valgrind_build VALGRIND_VERSION=3.16.1 -if ! wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2 +if ! 
wget $WGET_FLAG http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2 then - wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2 + wget $WGET_FLAG https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2 fi tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2 cd valgrind-${VALGRIND_VERSION} diff --git a/.circleci/docker/common/install_rocm.sh b/.circleci/docker/common/install_rocm.sh index fa4df9a81b4087..d454b9bfddc4f6 100644 --- a/.circleci/docker/common/install_rocm.sh +++ b/.circleci/docker/common/install_rocm.sh @@ -14,7 +14,7 @@ install_magma() { cp make.inc-examples/make.inc.hip-gcc-mkl make.inc echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc - echo 'DEVCCFLAGS += --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906 --amdgpu-target=gfx908 --gpu-max-threads-per-block=256' >> make.inc + echo 'DEVCCFLAGS += --amdgpu-target=gfx900 --amdgpu-target=gfx906 --amdgpu-target=gfx908 --amdgpu-target=gfx90a --gpu-max-threads-per-block=256' >> make.inc # hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition sed -i 's/^FOPENMP/#FOPENMP/g' make.inc export PATH="${PATH}:/opt/rocm/bin" @@ -29,12 +29,19 @@ ver() { printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' '); } +# Map ROCm version to AMDGPU version +declare -A AMDGPU_VERSIONS=( ["4.5.2"]="21.40.2" ["5.0"]="21.50" ["5.0.1"]="21.50.1" ["5.1"]="22.10" ["5.1.1"]="22.10.1" ["5.1.3"]="22.10.3" ["5.2"]="22.20" ["5.2.1"]="22.20.1" ) + install_ubuntu() { apt-get update if [[ $UBUNTU_VERSION == 18.04 ]]; then # gpg-agent is not available by default on 18.04 apt-get install -y --no-install-recommends gpg-agent fi + if [[ $UBUNTU_VERSION == 20.04 ]]; then + # gpg-agent is not available by default on 20.04 + apt-get install -y --no-install-recommends gpg-agent + fi apt-get install -y kmod apt-get install -y wget @@ -42,6 +49,13 @@ install_ubuntu() { apt-get install -y libc++1 apt-get install -y libc++abi1 + if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then + # Add amdgpu repository + UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` + local amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu" + echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list + fi + ROCM_REPO="ubuntu" if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then ROCM_REPO="xenial" @@ -49,7 +63,8 @@ install_ubuntu() { # Add rocm repository wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - - echo "deb [arch=amd64] http://repo.radeon.com/rocm/apt/${ROCM_VERSION} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list + local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}" + echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list apt-get update --allow-insecure-repositories DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ @@ -86,11 +101,24 @@ install_centos() { yum install -y epel-release yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r` + if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then + # Add amdgpu repository + local amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64" + echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo + echo 
"name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo + echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo + echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo + fi + + local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" echo "[ROCm]" > /etc/yum.repos.d/rocm.repo echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo - echo "baseurl=http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" >> /etc/yum.repos.d/rocm.repo + echo "baseurl=${rocm_baseurl}" >> /etc/yum.repos.d/rocm.repo echo "enabled=1" >> /etc/yum.repos.d/rocm.repo - echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo + echo "gpgcheck=1" >> /etc/yum.repos.d/rocm.repo + echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/rocm.repo yum update -y diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh index db2b0660d9f506..8e5b0b99c854b7 100755 --- a/.circleci/scripts/binary_checkout.sh +++ b/.circleci/scripts/binary_checkout.sh @@ -61,7 +61,7 @@ git --no-pager log --max-count 1 popd # Clone the Builder master repo -retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT" +retry git clone -q https://github.com/pytorch/builder.git -b release/1.10 "$BUILDER_ROOT" pushd "$BUILDER_ROOT" echo "Using builder from " git --no-pager log --max-count 1 diff --git a/.circleci/scripts/binary_ios_test.sh b/.circleci/scripts/binary_ios_test.sh index 5280b62a4e708c..c3d91a231ee88f 100644 --- a/.circleci/scripts/binary_ios_test.sh +++ b/.circleci/scripts/binary_ios_test.sh @@ -8,16 +8,17 @@ cd ${PROJ_ROOT}/ios/TestApp # install fastlane sudo gem install bundler && bundle install # install certificates -echo "${IOS_CERT_KEY}" >> cert.txt +echo "${IOS_CERT_KEY_2022}" >> cert.txt base64 --decode cert.txt -o Certificates.p12 rm cert.txt -bundle exec fastlane install_cert +bundle exec fastlane install_root_cert +bundle exec fastlane install_dev_cert # install the provisioning profile -PROFILE=PyTorch_CI_2021.mobileprovision +PROFILE=PyTorch_CI_2022.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" -echo "${IOS_SIGN_KEY}" >> cert.txt +echo "${IOS_SIGN_KEY_2022}" >> cert.txt base64 --decode cert.txt -o ${PROFILE} rm cert.txt # run the ruby build script @@ -25,5 +26,5 @@ if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' 
exit 1 fi -PROFILE=PyTorch_CI_2021 +PROFILE=PyTorch_CI_2022 ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM} -c ${PROFILE} -t ${IOS_DEV_TEAM_ID} -f Accelerate,MetalPerformanceShaders,CoreML diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index 765c1d82067e10..f82512236b8735 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -467,16 +467,17 @@ # install fastlane sudo gem install bundler && bundle install # install certificates - echo ${IOS_CERT_KEY} >> cert.txt + echo ${IOS_CERT_KEY_2022} >> cert.txt base64 --decode cert.txt -o Certificates.p12 rm cert.txt - bundle exec fastlane install_cert + bundle exec fastlane install_root_cert + bundle exec fastlane install_dev_cert # install the provisioning profile - PROFILE=PyTorch_CI_2021.mobileprovision + PROFILE=PyTorch_CI_2022.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" - echo ${IOS_SIGN_KEY} >> cert.txt + echo ${IOS_SIGN_KEY_2022} >> cert.txt base64 --decode cert.txt -o ${PROFILE} rm cert.txt - run: @@ -535,7 +536,7 @@ command: | set -e PROJ_ROOT=/Users/distiller/project - PROFILE=PyTorch_CI_2021 + PROFILE=PyTorch_CI_2022 # run the ruby build script if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 437a28b1c98185..7e7a2e07989ed7 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -158,7 +158,8 @@ jobs: } if is_vanilla_build; then - echo "apt-get update && apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash + echo "apt-get update || apt-get install libgnutls30" | docker exec -u root -i "$id" bash + echo "apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash echo "cd workspace/build; qemu-x86_64 -g 2345 -cpu Broadwell -E ATEN_CPU_CAPABILITY=default ./bin/basic --gtest_filter=BasicTest.BasicTestCPU & gdb ./bin/basic -ex 'set pagination off' -ex 'target remote :2345' -ex 'continue' -ex 'bt' -ex='set confirm off' -ex 'quit \$_isvoid(\$_exitcode)'" | docker exec -u jenkins -i "$id" bash else echo "Skipping for ${BUILD_ENVIRONMENT}" diff --git a/.gitmodules b/.gitmodules index a7cc437f438402..0e2dc99d5bc5f4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -135,7 +135,7 @@ url = https://github.com/NVIDIA/cudnn-frontend.git [submodule "third_party/kineto"] path = third_party/kineto - url = https://github.com/pytorch/kineto + url = https://github.com/mwootton/kineto [submodule "third_party/pocketfft"] path = third_party/pocketfft url = https://github.com/mreineck/pocketfft diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index 0cfb5e01184a7c..338b3e1e3de7cd 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -63,9 +63,9 @@ function get_pr_change_files() { function file_diff_from_base() { # The fetch may fail on Docker hosts, this fetch is necessary for GHA set +e - git fetch origin master --quiet + git fetch origin release/1.10 --quiet set -e - git diff --name-only "$(git merge-base origin/master HEAD)" > "$1" + git diff --name-only 
"$(git merge-base origin/release/1.10 HEAD)" > "$1" } function get_bazel() { @@ -99,5 +99,5 @@ function checkout_install_torchvision() { } function clone_pytorch_xla() { - git clone --recursive https://github.com/pytorch/xla.git + git clone --recursive -b r1.10 https://github.com/pytorch/xla.git } diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 3f39431ba54a1d..60d4015c78e783 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -230,6 +230,8 @@ test_aten() { test_without_numpy() { pushd "$(dirname "${BASH_SOURCE[0]}")" python -c "import sys;sys.path.insert(0, 'fake_numpy');from unittest import TestCase;import torch;x=torch.randn(3,3);TestCase().assertRaises(RuntimeError, lambda: x.numpy())" + # Regression test for https://github.com/pytorch/pytorch/issues/66353 + python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;print(torch.tensor([torch.tensor(0.), torch.tensor(1.)]))" popd } @@ -424,7 +426,7 @@ test_backward_compatibility() { python -m venv venv # shellcheck disable=SC1091 . venv/bin/activate - pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html + pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_nightly.html pip show torch python dump_all_function_schemas.py --filename nightly_schemas.txt deactivate diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c11507838fde1..0d8d40d01013e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,7 +148,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) -option(BUILD_CAFFE2 "Master flag to build Caffe2" ON) +option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF) option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF) cmake_dependent_option( BUILD_CAFFE2_OPS "Build Caffe2 operators" ON diff --git a/aten/src/ATen/ConjugateFallback.cpp b/aten/src/ATen/ConjugateFallback.cpp index 21c044ee074e48..79076d67658c11 100644 --- a/aten/src/ATen/ConjugateFallback.cpp +++ b/aten/src/ATen/ConjugateFallback.cpp @@ -2,21 +2,12 @@ #include namespace at { - +namespace native { struct ConjFallback : MathOpFallback { ConjFallback() : MathOpFallback(DispatchKey::Conjugate, "conjugate") {} bool is_bit_set(const Tensor& tensor) override { return tensor.is_conj(); } - void _set_bit(const Tensor& tensor, bool value) override { - return tensor._set_conj(value); - } - Tensor resolve_bit(const Tensor& tensor) override { - return at::resolve_conj(tensor); - } - Tensor& math_op_(Tensor& tensor) override { - return at::conj_physical_(tensor); - } }; void conjugateFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { @@ -60,4 +51,5 @@ TORCH_LIBRARY_IMPL(aten, Conjugate, m) { TENSOR_UTILITIES_AND_CONSTRUCTORS(m) } +} } // namespace at diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index bf06efa2a2bb5c..42abef9cc3cac4 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -254,6 +254,20 @@ bool NoTF32Guard::should_disable_tf32() { return override_allow_tf32_flag; } +thread_local bool BackwardPassGuard::is_backward_pass_; + +BackwardPassGuard::BackwardPassGuard() { + is_backward_pass_ = true; +} + +BackwardPassGuard::~BackwardPassGuard() { + is_backward_pass_ = false; +} + +bool BackwardPassGuard::is_backward_pass() { + return is_backward_pass_; +} + bool 
Context::areVmapFallbackWarningsEnabled() const { return display_vmap_fallback_warnings_; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 4a45ac6f8ac18e..f30ffb57f9f21c 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -380,4 +380,12 @@ struct TORCH_API NoTF32Guard { bool changed = false; }; +struct TORCH_API BackwardPassGuard { + BackwardPassGuard(); + ~BackwardPassGuard(); + static bool is_backward_pass(); +private: + static thread_local bool is_backward_pass_; +}; + } // namespace at diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index e9258603f4306d..f46245e72ca4ba 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -663,7 +663,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { int64_t numel = this->numel(); if (numel == 0) { return; - } else if (numel < grain_size || at::get_num_threads() == 1) { + } else if (numel < internal::GRAIN_SIZE || at::get_num_threads() == 1) { return serial_for_each(loop, {0, numel}); } else { at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) { diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 70c3dda6f3401f..6904b6d72b2368 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -4,6 +4,13 @@ #include #include +#include +#include + +#ifdef __HIP_PLATFORM_HCC__ +#define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +#define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +#endif #define CUDABLAS_POSINT_CHECK(FD, X) \ TORCH_CHECK( \ @@ -95,7 +102,7 @@ namespace at { namespace cuda { namespace blas { -const char* _cublasGetErrorEnum(cublasStatus_t error) { +C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error) { if (error == CUBLAS_STATUS_SUCCESS) { return "CUBLAS_STATUS_SUCCESS"; } @@ -272,13 +279,17 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { float falpha = alpha; float fbeta = beta; #ifdef __HIP_PLATFORM_HCC__ + int flag = 0; +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; +#endif TORCH_CUDABLAS_CHECK(rocblas_gemm_strided_batched_ex(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, (void*)&fbeta, c, rocblas_datatype_f16_r, (int)ldc, stridec, c, rocblas_datatype_f16_r, (int)ldc, stridec, (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, - 0, 0)); + 0, flag)); #else #if defined(CUDA_VERSION) && CUDA_VERSION < 11000 // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH @@ -418,6 +429,10 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); GEMM_CHECK_ARGVALUES(at::Half); #ifdef __HIP_PLATFORM_HCC__ + int flag = 0; +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::BackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +#endif TORCH_CUDABLAS_CHECK(rocblas_gemm_ex( handle, opa, @@ -442,7 +457,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, - 0)); + flag)); #else cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 5) { diff --git a/aten/src/ATen/cuda/CUDASolver.cpp b/aten/src/ATen/cuda/CUDASolver.cpp index 355238f6b2dcf4..8b906d4931feba 100644 --- a/aten/src/ATen/cuda/CUDASolver.cpp +++ b/aten/src/ATen/cuda/CUDASolver.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef CUDART_VERSION @@ -9,7 +10,7 @@ namespace at { namespace cuda { namespace solver { -const char* cusolverGetErrorMessage(cusolverStatus_t status) { +C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status) { switch (status) { case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCES"; case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h index 1414e319656bd3..d3fdfec69c9a62 100644 --- a/aten/src/ATen/cuda/Exceptions.h +++ b/aten/src/ATen/cuda/Exceptions.h @@ -2,6 +2,7 @@ #include #include +#include #ifdef CUDART_VERSION #include @@ -39,7 +40,7 @@ class CuDNNError : public c10::Error { } while (0) namespace at { namespace cuda { namespace blas { -const char* _cublasGetErrorEnum(cublasStatus_t error); +C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error); }}} // namespace at::cuda::blas #define TORCH_CUDABLAS_CHECK(EXPR) \ @@ -66,7 +67,7 @@ const char *cusparseGetErrorString(cusparseStatus_t status); #ifdef CUDART_VERSION namespace at { namespace cuda { namespace solver { -const char* cusolverGetErrorMessage(cusolverStatus_t status); +C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status); }}} // namespace at::cuda::solver #define TORCH_CUSOLVER_CHECK(EXPR) \ diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 26f804768e42c1..cf835219750964 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -235,6 +235,16 @@ struct chained_iterator { template inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, int64_t num_items) { +#if defined(USE_ROCM) && (ROCM_VERSION >= 50000) + //For ROCm, use hipCUB chained iterators + CUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::InclusiveScan, + input, + output, + scan_op, + num_items, + at::cuda::getCurrentCUDAStream()); + C10_HIP_KERNEL_LAUNCH_CHECK(); +#else // non synchronizing cub call // even though cub is supposed to support tensors with int_max elements, in reality it doesn't, // so split at int_max/2 @@ -278,10 +288,22 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT size_cub, at::cuda::getCurrentCUDAStream()); } +#endif } template inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, InitValueT init_value, int64_t num_items) { +#if defined(USE_ROCM) && (ROCM_VERSION >= 50000) + //For ROCm, use hipCUB chained iterators + CUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::ExclusiveScan, + input, + output, + scan_op, + init_value, + num_items, + at::cuda::getCurrentCUDAStream()); + C10_HIP_KERNEL_LAUNCH_CHECK(); +#else // non synchronizing cub call // even though cub is supposed to support tensors with int_max elements, in reality it doesn't, // so split at int_max/2 @@ -316,6 +338,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT 
size_cub, at::cuda::getCurrentCUDAStream()); } +#endif } template diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h index a7d6578b2889f2..3aaa9d06c5e91f 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -10,7 +10,7 @@ class DataPtr; namespace hip { namespace HIPCachingAllocatorMasqueradingAsCUDA { -Allocator* get(); +C10_HIP_API Allocator* get(); C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream); } // namespace HIPCachingAllocatorMasqueradingAsCUDA diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 7974840dd3f971..9105c831e993ad 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -240,14 +240,11 @@ Tensor _pdist_backward(const Tensor& grad, const Tensor& self, const double p, c } Tensor cosine_similarity(const Tensor& x1, const Tensor& x2, int64_t dim, double eps) { - TORCH_CHECK(x1.ndimension() == x2.ndimension(), "cosine_similarity requires both inputs to have the same number of dimensions, but x1 has ", - x1.ndimension(), " and x2 has ", x2.ndimension()); - TORCH_CHECK(x1.ndimension() == 0 || x1.size(dim) == x2.size(dim), "cosine_similarity requires both inputs to have the same size at dimension ", dim, "but x1 has ", - x1.size(dim), " and x2 has ", x2.size(dim)); + auto common_size = at::infer_size_dimvector(x1.sizes(), x2.sizes()); auto commonDtype = at::result_type(x1, x2); TORCH_CHECK(at::isFloatingType(commonDtype), "expected common dtype to be floating point, yet common dtype is ", commonDtype); - Tensor x1_ = x1.to(commonDtype); - Tensor x2_ = x2.to(commonDtype); + Tensor x1_ = x1.to(commonDtype).expand(common_size); + Tensor x2_ = x2.to(commonDtype).expand(common_size); // Follow scipy impl to improve numerical precision // Use x / sqrt(x * x) instead of x / (sqrt(x) * sqrt(x)) Tensor w12 = at::sum(x1_ * x2_, dim); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index ed5aa847bea4e9..00211942ed7e98 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -238,7 +238,7 @@ template class normal_kernel, typename RNG> Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional gen) { TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex"); TORCH_CHECK( - std.min().ge(0).item(), + std.numel() == 0 || std.min().ge(0).item(), "normal expects all elements of std >= 0.0"); bool is_deprecated_th_impl = resize_output_for_normal(output, mean, std); normal_impl_(output, 0, 1, gen); diff --git a/aten/src/ATen/native/MathBitFallThroughLists.h b/aten/src/ATen/native/MathBitFallThroughLists.h index 052e730194ee90..ea3fed748c0b1f 100644 --- a/aten/src/ATen/native/MathBitFallThroughLists.h +++ b/aten/src/ATen/native/MathBitFallThroughLists.h @@ -50,7 +50,8 @@ namespace at { m.impl("vsplit.array", torch::CppFunction::makeFallthrough()); \ m.impl("conj", torch::CppFunction::makeFallthrough()); \ m.impl("_conj", torch::CppFunction::makeFallthrough()); \ - m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); + m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); \ + m.impl("resize_", torch::CppFunction::makeFallthrough()); #define TENSOR_UTILITIES_AND_CONSTRUCTORS(m) \ m.impl("empty_like", 
torch::CppFunction::makeFallthrough()); \ diff --git a/aten/src/ATen/native/MathBitsFallback.h b/aten/src/ATen/native/MathBitsFallback.h index 2e78ebd66b66ec..89670699fcb6d7 100644 --- a/aten/src/ATen/native/MathBitsFallback.h +++ b/aten/src/ATen/native/MathBitsFallback.h @@ -3,42 +3,49 @@ #include #include #include +#include #include #include namespace at { - +namespace native { // This fallback should only be used for operations that are self inverse and have a corresponding tensor // bit (internally implemented using DispatchKey) to maintain the state on tensor using tensor bit. // Currently there are two tensor bits that trigger this fallback: conjugate bit and negative bit. // Conjugate bit is set on a tensor when `.conj()` is called and neg bit is set on a tensor when `.conj().imag` is called. +// NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit. struct MathOpFallback { MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(op_name_) {} virtual bool is_bit_set(const Tensor&) = 0; - virtual void _set_bit(const Tensor&, bool) = 0; - // materializes the bit, i.e., returns a new tensor tensor containing the true output - // (after performing the math operation corresponding to the tensor bit) if the bit is set to 1 - // else returns self. - virtual Tensor resolve_bit(const Tensor&) = 0; - // in-place operation corresponding to the math op represented by the bit. Im the future if this class - // is generalized for ops that are not self inverse, then this must be replaced by op_inverse_inplace - virtual Tensor& math_op_(Tensor&) = 0; void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { - // Situations to handle: - // 1. Out-of-place operation. Easy: materialize all inputs and - // call it a day. - // 2. Inplace operation. Desugar x.add_(2) into x.conj_().add_(2).conj_(). - // Materialize other inputs as in (1). - // 3. out= operation. Desugar add(x, 2, out=y) into y.copy_(add(x, 2)) - // Materialize other inputs as in (1). - // - // It is important to be able to tell if we READ from an argument and if we - // WRITE from an argument. Conservative approach is to assume that we always - // READ from an argument, but in out-of-place operations you can skip - // conjugating inputs on entry that never get used. In current schema we - // can't easily tell if inplace situation has happened, so don't do it. + /* + Situations to handle: + 1. Out-of-place operation. Easy: materialize all inputs and + call it a day. + 2. Inplace operation. Desugar x.add_(2) into x.conj_().add_(2).conj_(). + Materialize other inputs as in (1). + 3. out= operation. Desugar add(x, 2, out=y) into y.copy_(add(x, 2)) + Materialize other inputs as in (1). + + It is important to be able to tell if we READ from an argument and if we + WRITE to an argument. Conservative approach is to assume that we always + READ from an argument, but in out= operations you can skip + conjugating inputs on entry that never get used. In the current schema we + can't easily tell if the operation is in in-place or out= operation. + Note: + 1. Mutable tensorlists containing tensors whose math bit set to true are disallowed. + 2. Mutable tensors with math bit set to true are unconditionally cloned to ensure + correct behavior in the case when the mutable tensor shares memory with non mutable arguments. 
+ + If we were to in-place resolve the math bit for mutable inputs, then the non-mutable inputs sharing partial or full memory + with these mutable inputs would read into wrong values in the following cases: + 1. Non mutable inputs have their math bit set to false. + 2. Math bit for mutable input(s) is resolved before the non mutable inputs (with bit set to true and sharing memory + with one or more mutable arg(s)) are cloned. + At the end, the final value of the mutable arguments from the stack are copied into the original input mutable tensor inputs. + */ const auto& arguments = op.schema().arguments(); const auto num_arguments = arguments.size(); const auto stack_start = stack->size() - num_arguments; @@ -72,9 +79,8 @@ struct MathOpFallback { return; } - // Mutable inputs to be tracked separately - std::vector mutable_inputs; - + // Mutable inputs with math bit set to True and their clones + std::vector> mutable_inputs_with_their_clones; for (const auto i : c10::irange(num_arguments)) { auto& ivalue = (*stack)[stack_start + i]; if (!(ivalue.isTensor() || ivalue.isTensorList())) { @@ -91,31 +97,26 @@ struct MathOpFallback { if (!is_bit_set(ivalue.toTensor())) { continue; } - auto tensor = std::move(ivalue).toTensor(); TORCH_CHECK_NOT_IMPLEMENTED(!tensor.is_meta(), op_name, " fallback does not support meta tensors."); + auto resolved_tensor = at::clone(tensor); if (mut_arg) { - // TODO: This is a waste if the argument is write only - _set_bit(tensor, false); - math_op_(tensor); - mutable_inputs.emplace_back(tensor); - } else { - tensor = resolve_bit(tensor); + TORCH_CHECK(mutable_inputs_with_their_clones.empty(), op_name, " fallback does not support operators with more than one mutable tensors with ", + op_name, "bit set to true."); + mutable_inputs_with_their_clones.emplace_back(std::make_pair(std::move(tensor), resolved_tensor)); } - (*stack)[stack_start + i] = std::move(tensor); + (*stack)[stack_start + i] = std::move(resolved_tensor); } else if (ivalue.isTensorList()) { auto tensors = std::move(ivalue).toTensorList(); - if (mut_arg) { - for(const auto j : c10::irange(tensors.size())) { - Tensor t = tensors[j]; - _set_bit(t, false); - math_op_(t); - mutable_inputs.emplace_back(t); - } - } else { - for(const auto j : c10::irange(tensors.size())) { - tensors[j] = resolve_bit(tensors[j]); + for(const auto j : c10::irange(tensors.size())) { + const auto& tensor = tensors[j]; + if (!is_bit_set(tensor)) { + continue; } + TORCH_CHECK(!mut_arg, " fallback doesn't currently support mutable TensorLists with ", + op_name, " inputs. 
Please materialize all the ", op_name, " input tensor(s) in the mutable TensorList inputs before calling ", + op.schema().name()); + tensors[j] = at::clone(tensor); } (*stack)[stack_start + i] = std::move(tensors); } @@ -123,9 +124,22 @@ struct MathOpFallback { op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack); - for (auto& mutable_input : mutable_inputs) { - math_op_(mutable_input); - _set_bit(mutable_input, true); + TORCH_INTERNAL_ASSERT(mutable_inputs_with_their_clones.size() <= 1); + + for (std::pair mut_tensors: mutable_inputs_with_their_clones) { + auto& mutable_input = mut_tensors.first; + auto& cloned_mutable_input = mut_tensors.second; + auto& ivalue = (*stack)[stack_start]; + auto returned_output = std::move(ivalue).toTensor(); + + // sanity check to ensure that the tensor in stack aliases the cloned_mutable_input + TORCH_INTERNAL_ASSERT(cloned_mutable_input.is_same(returned_output)); + + // necessary for out= arg + at::native::resize_output(mutable_input, returned_output.sizes()); + + mutable_input.copy_(returned_output); + (*stack)[stack_start] = std::move(mutable_input); } } @@ -134,5 +148,5 @@ struct MathOpFallback { DispatchKey key; string op_name; }; - -} // namespace at +} +}// namespace at diff --git a/aten/src/ATen/native/NegateFallback.cpp b/aten/src/ATen/native/NegateFallback.cpp index 8920a2a4d78721..62e7eccea7113e 100644 --- a/aten/src/ATen/native/NegateFallback.cpp +++ b/aten/src/ATen/native/NegateFallback.cpp @@ -2,21 +2,12 @@ #include namespace at { - +namespace native { struct NegFallback : MathOpFallback { NegFallback() : MathOpFallback(DispatchKey::Negative, "negation") {} bool is_bit_set(const Tensor& tensor) override { return tensor.is_neg(); } - void _set_bit(const Tensor& tensor, bool value) override { - return tensor._set_neg(value); - } - Tensor resolve_bit(const Tensor& tensor) override { - return at::resolve_neg(tensor); - } - Tensor& math_op_(Tensor& tensor) override { - return tensor.neg_(); - } }; void negationFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { @@ -42,4 +33,5 @@ TORCH_LIBRARY_IMPL(aten, Negative, m) { TENSOR_UTILITIES_AND_CONSTRUCTORS(m) } +} } // namespace at diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index d11a5bb074c5d6..4b9a51f4b2fedf 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -60,12 +60,12 @@ namespace at { namespace native { template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { using traits = function_traits; - int remaining = N - block_work_size * blockIdx.x; + int remaining = N - block_work_size() * blockIdx.x; - if (remaining < block_work_size) { // if this block handles the reminder, just do a naive unrolled loop + if (remaining < block_work_size()) { // if this block handles the reminder, just do a naive unrolled loop auto input_calc = TrivialOffsetCalculator(); auto output_calc = TrivialOffsetCalculator<1>(); auto loader = memory::LoadWithoutCast(); @@ -80,11 +80,11 @@ __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { } template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void unrolled_elementwise_kernel(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s) { - int remaining = N - block_work_size * blockIdx.x; + int remaining 
= N - block_work_size() * blockIdx.x; auto policy = memory::policies::unroll(data, remaining, ic, oc, l, s); elementwise_kernel_helper(f, policy); } @@ -94,17 +94,17 @@ template static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t data) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); using traits = function_traits; - int64_t grid = (N + block_work_size - 1) / block_work_size; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); int vec_size = memory::can_vectorize_up_to(data); switch (vec_size) { case 4: - vectorized_elementwise_kernel<4, func_t, array_t><<>>(N, f, data); + vectorized_elementwise_kernel<4, func_t, array_t><<>>(N, f, data); C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 2: - vectorized_elementwise_kernel<2, func_t, array_t><<>>(N, f, data); + vectorized_elementwise_kernel<2, func_t, array_t><<>>(N, f, data); C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 1: { @@ -112,7 +112,7 @@ static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t auto output_calc = TrivialOffsetCalculator<1>(); auto loader = memory::LoadWithoutCast(); auto storer = memory::StoreWithoutCast(); - unrolled_elementwise_kernel<<>>(N, f, data, input_calc, output_calc, loader, storer); + unrolled_elementwise_kernel<<>>(N, f, data, input_calc, output_calc, loader, storer); C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } @@ -126,9 +126,9 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - int64_t grid = (N + block_work_size - 1) / block_work_size; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); - unrolled_elementwise_kernel<<>>(N, f, data, ic, oc, l, s); + unrolled_elementwise_kernel<<>>(N, f, data, ic, oc, l, s); C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/cuda/CrossKernel.cu b/aten/src/ATen/native/cuda/CrossKernel.cu index 06bea0aea972a9..3d06ce3fdf31a2 100644 --- a/aten/src/ATen/native/cuda/CrossKernel.cu +++ b/aten/src/ATen/native/cuda/CrossKernel.cu @@ -36,7 +36,7 @@ void launch_cross_kernel(const TensorIteratorBase& iter, int64_t ostride, const auto N = iter.numel(); auto offset_calculator = make_element_offset_calculator<3>(iter); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(N > 0 && N <= std::numeric_limits::max()); - int64_t grid = (N + NUM_THREADS - 1) / NUM_THREADS; + int64_t grid = (N + num_threads() - 1) / num_threads(); auto stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kHalf, iter.common_dtype(), "cross_cuda", [&] { @@ -45,11 +45,11 @@ void launch_cross_kernel(const TensorIteratorBase& iter, int64_t ostride, auto x2 = static_cast(iter.data_ptr(2)); constexpr int64_t int_max = std::numeric_limits::max(); if (ostride * 2 > int_max || x1stride * 2 > int_max || x2stride * 2 > int_max) { - cross_kernel<<>>( + cross_kernel<<>>( N, out, x1, x2, offset_calculator, ostride, x1stride, x2stride); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - cross_kernel<<>>( + cross_kernel<<>>( N, out, x1, x2, offset_calculator, static_cast(ostride), static_cast(x1stride), diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 481b5589684484..2f25b224e77fb0 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ 
-188,11 +188,11 @@ __global__ void distribution_binary_elementwise_kernel( using input_t_1 = typename function_traits::template arg<1>::type; using input_t_2 = typename function_traits::template arg<2>::type; - input_t_1 inputs_1[thread_work_size]; - input_t_2 inputs_2[thread_work_size]; + input_t_1 inputs_1[thread_work_size()]; + input_t_2 inputs_2[thread_work_size()]; - int base_index = BLOCK_WORK_SIZE * blockIdx.x; - int remaining = std::min(numel - base_index, BLOCK_WORK_SIZE); + int base_index = block_work_size() * blockIdx.x; + int remaining = std::min(numel - base_index, block_work_size()); curandStatePhilox4_32_10_t state; curand_init(std::get<0>(seeds), @@ -203,7 +203,7 @@ __global__ void distribution_binary_elementwise_kernel( // load data into registers int thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { break; } @@ -212,20 +212,20 @@ __global__ void distribution_binary_elementwise_kernel( inputs_1[i] = input_data_1[offsets[0]]; inputs_2[i] = input_data_2[offsets[1]]; - thread_idx += num_threads; + thread_idx += num_threads(); } // compute and store thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { break; } int input_idx = thread_idx + base_index; auto offsets = out_calc.get(input_idx); output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]); - thread_idx += num_threads; + thread_idx += num_threads(); } } @@ -254,16 +254,16 @@ void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_arg const input_t_1 *input_data_1 = static_cast(iter.data_ptr(1)); const input_t_2 *input_data_2 = static_cast(iter.data_ptr(2)); - int64_t grid = (numel + block_work_size - 1) / block_work_size; + int64_t grid = (numel + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); if (iter.is_contiguous()) { - distribution_binary_elementwise_kernel<<>>( + distribution_binary_elementwise_kernel<<>>( numel, f, philox_args, output_data, input_data_1, input_data_2, TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - distribution_binary_elementwise_kernel<<>>( + distribution_binary_elementwise_kernel<<>>( numel, f, philox_args, output_data, input_data_1, input_data_2, make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu index 180385aaf05202..57462ffe5da0dd 100644 --- a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu +++ b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu @@ -85,7 +85,7 @@ void _compute_linear_combination_internal_kernel( } }; - _lauch_kernel(iter.numel(), loop); + _lauch_kernel(iter.numel(), loop); } void _compute_linear_combination_cuda_kernel( diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index b4936c069b0b12..c6439e586f0b51 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -137,7 +137,7 @@ void _unpack_pivots_internal_kernel( } }; - _launch_kernel(iter.numel(), loop); + _launch_kernel(iter.numel(), loop); } void unpack_pivots_cuda_kernel( diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 
8849293e20210e..d3a7027a8fb02c 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -9,13 +9,9 @@ #include -#define NUM_THREADS (C10_WARP_SIZE * 2) -#define THREAD_WORK_SIZE 4 -#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * num_threads) - -constexpr int num_threads = NUM_THREADS; -constexpr int thread_work_size = THREAD_WORK_SIZE; -constexpr int block_work_size = BLOCK_WORK_SIZE; +constexpr int num_threads() { return C10_WARP_SIZE * 4; } +constexpr int thread_work_size() { return 4; } +constexpr int block_work_size() { return thread_work_size() * num_threads(); } #include @@ -55,15 +51,15 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { int idx = blockIdx.x; - return_t results[thread_work_size]; - args_t args[thread_work_size]; + return_t results[thread_work_size()]; + args_t args[thread_work_size()]; // load policy.load(args, idx); // compute #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (policy.check_inbounds(i)) { results[i] = c10::guts::apply(f, args[i]); } @@ -209,18 +205,18 @@ template struct is_tuple: std::false_type {}; template struct is_tuple>: std::true_type {}; template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) { - int remaining = N - block_work_size * blockIdx.x; + int remaining = N - block_work_size() * blockIdx.x; elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll(data, remaining, ic, oc)); } template static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - int64_t grid = (N + block_work_size - 1) / block_work_size; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); - unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); + unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/cuda/MemoryAccess.cuh b/aten/src/ATen/native/cuda/MemoryAccess.cuh index 9bc1dd954eebb2..b9d94d6ce71b63 100644 --- a/aten/src/ATen/native/cuda/MemoryAccess.cuh +++ b/aten/src/ATen/native/cuda/MemoryAccess.cuh @@ -59,7 +59,7 @@ struct vectorized_load_helper { using arg_t = std::tuple_element_t; // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we // need a +1 offset to get the input - auto ptr = reinterpret_cast(self.data[arg_index + 1]) + block_work_size * idx; + auto ptr = reinterpret_cast(self.data[arg_index + 1]) + block_work_size() * idx; auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get(args[thread_unroll_idx]); }; self.load_single_arg(args_accessor, ptr); } @@ -164,7 +164,7 @@ struct unroll { data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {} __device__ inline bool check_inbounds(int thread_work_elem) { - return ((threadIdx.x + thread_work_elem*num_threads) < remaining); + return ((threadIdx.x + thread_work_elem*num_threads()) < remaining); } template @@ -172,30 +172,30 @@ struct unroll { constexpr int arity = std::tuple_size::value; int thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < 
thread_work_size(); i++) { if (thread_idx >= remaining) { return; } - int linear_idx = thread_idx + block_work_size * idx; + int linear_idx = thread_idx + block_work_size() * idx; auto offset = input_offset_calculator.get(linear_idx); detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); - thread_idx += num_threads; + thread_idx += num_threads(); } } template __device__ inline void store(scalar_t *from, int idx) { int thread_idx = threadIdx.x; - scalar_t *to = reinterpret_cast(data[0]) + block_work_size * idx; + scalar_t *to = reinterpret_cast(data[0]) + block_work_size() * idx; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { return; } - int linear_idx = thread_idx + block_work_size * idx; + int linear_idx = thread_idx + block_work_size() * idx; int offset = output_offset_calculator.get(linear_idx)[0]; storer.store(from[i], data[0], offset); - thread_idx += num_threads; + thread_idx += num_threads(); } } }; @@ -208,8 +208,8 @@ struct unroll { template // vec_size: number of scalars, can be 1, 2, or 4. struct vectorized { - static_assert(thread_work_size % vec_size == 0, "The workload per thread must be a multiple of vec_size"); - static constexpr int loop_size = thread_work_size / vec_size; + static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size"); + static constexpr int loop_size = thread_work_size() / vec_size; data_t data; @@ -226,7 +226,7 @@ struct vectorized { int thread_idx = threadIdx.x; #pragma unroll for (int i = 0; i < loop_size; i++) { - int index = thread_idx + i * num_threads; + int index = thread_idx + i * num_threads(); vec_t v = from_[index]; #pragma unroll for (int j = 0; j < vec_size; j++) { @@ -244,12 +244,12 @@ struct vectorized { template __device__ inline void store(scalar_t *from, int idx) { using vec_t = aligned_vector; - scalar_t *to = reinterpret_cast(data[0]) + block_work_size * idx; + scalar_t *to = reinterpret_cast(data[0]) + block_work_size() * idx; vec_t *to_ = reinterpret_cast(to); int thread_idx = threadIdx.x; #pragma unroll for (int i = 0; i < loop_size; i++) { - int index = thread_idx + i * num_threads; + int index = thread_idx + i * num_threads(); vec_t v; for (int j = 0; j < vec_size; j++) { v.val[j] = from[vec_size * i + j]; @@ -274,7 +274,7 @@ struct multi_outputs_unroll { data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {} __device__ inline bool check_inbounds(int thread_work_elem) { - return ((threadIdx.x + thread_work_elem*num_threads) < remaining); + return ((threadIdx.x + thread_work_elem*num_threads()) < remaining); } template @@ -282,14 +282,14 @@ struct multi_outputs_unroll { constexpr int arity = std::tuple_size::value; int thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { return; } - int linear_idx = thread_idx + block_work_size * idx; + int linear_idx = thread_idx + block_work_size() * idx; auto offset = input_offset_calculator.get(linear_idx); detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); - thread_idx += num_threads; + thread_idx += num_threads(); } } @@ -298,14 +298,14 @@ struct multi_outputs_unroll { __device__ inline void store(return_t *from, int idx) { int thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < 
thread_work_size(); i++) { if (thread_idx >= this->remaining) { return; } - int linear_idx = thread_idx + block_work_size * idx; + int linear_idx = thread_idx + block_work_size() * idx; auto offsets = this->output_offset_calculator.get(linear_idx); memory::detail::static_unroll::with_args(this->data, offsets, from[i]); - thread_idx += num_threads; + thread_idx += num_threads(); } } }; diff --git a/aten/src/ATen/native/cuda/ROCmLoops.cuh b/aten/src/ATen/native/cuda/ROCmLoops.cuh index c339364b5a0211..2134c0f8c725fe 100644 --- a/aten/src/ATen/native/cuda/ROCmLoops.cuh +++ b/aten/src/ATen/native/cuda/ROCmLoops.cuh @@ -229,7 +229,7 @@ struct has_same_arg_types { } // namespace detail template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void elementwise_kernel(int N, func_t f, array_t data) { // Assumption: // 1. all arguments of `f` have the same type, which could be different from the return type of `f` @@ -246,7 +246,7 @@ __global__ void elementwise_kernel(int N, func_t f, array_t data) { constexpr int nargs = traits::arity == 0 ? 1 : traits::arity; int tid = threadIdx.x; - int idx = block_work_size * blockIdx.x + tid; + int idx = block_work_size() * blockIdx.x + tid; // compute base pointers return_t *result_base = reinterpret_cast(data[0]) + idx; @@ -257,31 +257,31 @@ __global__ void elementwise_kernel(int N, func_t f, array_t data) { } // fetch data - return_t results[thread_work_size]; - arg_t args[thread_work_size][nargs]; + return_t results[thread_work_size()]; + arg_t args[thread_work_size()][nargs]; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { - if (idx + num_threads * i < N) { + for (int i = 0; i < thread_work_size(); i++) { + if (idx + num_threads() * i < N) { #pragma unroll for (int j = 0; j < arity; j++) { - args[i][j] = *(args_base[j] + i * num_threads); + args[i][j] = *(args_base[j] + i * num_threads()); } } } // compute #pragma unroll - for (int i = 0; i < thread_work_size; i++) { - if (idx + num_threads * i < N) { + for (int i = 0; i < thread_work_size(); i++) { + if (idx + num_threads() * i < N) { results[i] = detail::invoke_with_array(f, args[i]); } } // store data #pragma unroll - for (int i = 0; i < thread_work_size; i++) { - if (idx + num_threads * i < N) { - *(result_base + i * num_threads) = results[i]; + for (int i = 0; i < thread_work_size(); i++) { + if (idx + num_threads() * i < N) { + *(result_base + i * num_threads()) = results[i]; } } } @@ -293,9 +293,9 @@ static void launch_kernel(int64_t N, const func_t& f, array_t data) { if (N == 0) { return; } - int64_t grid = (N + block_work_size - 1) / block_work_size; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); - elementwise_kernel<<>>(N, f, data); + elementwise_kernel<<>>(N, f, data); C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 5f03cc450f2069..2bb45a59d84332 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -120,7 +120,7 @@ struct _cuda_scatter_gather_internal_kernel { }; - _launch_scatter_gather_kernel(iter.numel(), loop); + _launch_scatter_gather_kernel(iter.numel(), loop); } }; // struct _cuda_scatter_fill_internal_kernel @@ -284,7 +284,7 @@ struct _cuda_scatter_fill_internal_kernel { }; - _launch_scatter_gather_kernel(iter.numel(), loop); + _launch_scatter_gather_kernel(iter.numel(), loop); } }; // struct 
_cuda_scatter_fill_internal_kernel diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 1fd151c1402495..006dfd11b52295 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -14,11 +14,7 @@ namespace at { namespace native { -#ifdef __HIP_PLATFORM_HCC__ -constexpr int CAT_ARRAY_BATCH_SIZE = 1024; -#else constexpr int CAT_ARRAY_BATCH_SIZE = 128; -#endif constexpr int CAT_ARRAY_MAX_INPUT_DIMS = 4; namespace { @@ -85,45 +81,6 @@ struct TensorSizeStride { */ -// Use pinned memory and and pass the struct by pointer on ROCm -template -struct CatArrInputTensor { - T* input; - IndexType offset; - IndexType dimSize; - IndexType nElements; -}; - -template -C10_LAUNCH_BOUNDS_1(512) -__global__ void HIP_CatArrayBatchedCopy( - T* output, - CatArrInputTensor* inputs, - TensorSizeStride os, - const int concatDim, - IndexType dimStride) { - - IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; - IndexType nElements = inputs[blockIdx.y].nElements; - - if(tid >= nElements) return; - - T* data = inputs[blockIdx.y].input; - IndexType offset = inputs[blockIdx.y].offset; - IndexType dimSize = inputs[blockIdx.y].dimSize; - IndexType dataOffset = offset * dimStride; - - IndexType stride = gridDim.x * blockDim.x; - - while( tid < nElements){ - IndexType elementOffset = CatArrIndexToOffset::compute( - os.tensorSize, os.tensorStride, dimSize, concatDim, tid); - output[dataOffset + elementOffset] = data[tid]; - - tid += stride; - } -} - // pass meta data directly through kernel argument instead of pin memory // In contiguous case, we will not need stride_size, setting it as 1 as placeholder // to pass compile. @@ -173,127 +130,6 @@ __global__ void CatArrayBatchedCopy( } } -template -void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, - int nDims, c10::MemoryFormat memory_format) { - // First, let's set up our kernel parameters. We start with a raw pointer to - // the storage for the output Tensor. - scalar_t *data = out.data_ptr(); - - // Kernel Parameter - long tensorMetadataSize = - sizeof(CatArrInputTensor) * CAT_ARRAY_BATCH_SIZE; - auto d_inputs_storage = at::empty( - {tensorMetadataSize}, out.options().dtype(at::kByte)); - auto d_inputs = static_cast *>( - d_inputs_storage.data_ptr()); - - TensorSizeStride outputParam; - - // Next, let's initialize the size, stride arrays for the output Tensor. 
- if (memory_format == c10::MemoryFormat::Contiguous) { - for (int i = 0; i < nDims; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i); - outputParam.tensorStride[i] = out.stride(i); - } - } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { - // permute the semantics of dims from NCHW to NHWC so that the input - // tensor is now contiguous - outputParam.tensorSize[0] = at::native::size(out, 0); - outputParam.tensorStride[0] = out.stride(0); - for (int i = 1; i < nDims - 1; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i + 1); - outputParam.tensorStride[i] = out.stride(i + 1); - } - outputParam.tensorSize[nDims - 1] = at::native::size(out, 1); - outputParam.tensorStride[nDims - 1] = out.stride(1); - } else { - TORCH_CHECK(false, "unsupported memory format"); - } - - at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); - - // Now we loop - int batchCounter = 0; - int64_t offset = 0; - for (int i = 0; i < inputs.size() ; i += CAT_ARRAY_BATCH_SIZE) { - // Re-allocate stackInputs every iteration to avoid read-after-write hazard - { - auto stackInputs_storage = at::empty({tensorMetadataSize}, - out.options().dtype(at::kByte).device(at::kCPU).pinned_memory(true)); - auto stackInputs = - static_cast *>( - stackInputs_storage.data_ptr()); - for (batchCounter = 0; - batchCounter < CAT_ARRAY_BATCH_SIZE && - (i+batchCounter) < inputs.size(); - ++batchCounter) { - int64_t dimSize = 0; - // There is a legacy case where a 1-D empty tensor can be concat with - // high-dimensional tensor - if (inputs[i+batchCounter].numel() > 0) { - dimSize = at::native::size(inputs[i+batchCounter], dimension); - } - - stackInputs[batchCounter].input = - inputs[i+batchCounter].data_ptr(); - stackInputs[batchCounter].offset = offset; - stackInputs[batchCounter].dimSize = dimSize; - stackInputs[batchCounter].nElements = inputs[i+batchCounter].numel(); - - // update offset - offset += dimSize; - } - at::native::copy_(d_inputs_storage, stackInputs_storage, - /* non_blocking= */ true); - } - - // Next, let's consider how we set our kernel launch parameters. - // We borrow from THCApply, which the kernel's internal indexing - // is based on. - dim3 applyBlock = dim3(32*16); - - //Get grid where x dim fills half gpu and y dim is number of tensors. - //This will have cating two tensors fill the entire grid, but prevent - //many threads from needlessly load meta data if their sizes is small. 
- dim3 catGrid; - getCatGrid(batchCounter, catGrid); - - if (memory_format != c10::MemoryFormat::Contiguous) { - switch (dimension) { - case 0: - break; - case 1: - dimension = nDims - dimension; - break; - default: - dimension--; - } - } - // Template Declarations for dim = 1, 2, 3, 4 -#define HANDLE_CASE(DIMS) \ - HIP_CatArrayBatchedCopy<<<\ - catGrid, applyBlock, 0, stream.stream()>>>(\ - data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); - switch (nDims) { - case 1: - HANDLE_CASE(1); - break; - case 2: - HANDLE_CASE(2); - break; - case 3: - HANDLE_CASE(3); - break; - case 4: - HANDLE_CASE(4); - break; - } -#undef HANDLE_CASE - } -} - template void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, int nDims, c10::MemoryFormat memory_format) { @@ -546,19 +382,6 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { }); allSameType = allSameType && (out.scalar_type() == firstType); -#ifdef __HIP_PLATFORM_HCC__ - if (inputs.size() > 1 && - out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && - at::cuda::detail::canUse32BitIndexMath(out) && - allContiguous && - all32BitIndexable && - allSameType) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - out.scalar_type(), "cat_cuda", [&]() { - hip_parallel_cat(out, inputs, dimension, nDims, memory_format); - }); -#else // We support the contiguous inputs and non-contiguous input (<=4 dims) in different ways // For contiguous input, we don't need to pass stride meta data to cuda kernel through constant // memory. Therefore, we could pass more inputs to cuda threads. @@ -587,7 +410,6 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { out.scalar_type(), "cat_cuda", [&]() { parallel_cat(out, inputs, dimension, nDims, memory_format); }); -#endif } else { int64_t offset = 0; for (int j = 0; j < inputs.size(); j++) diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu index 64bda79809bbbc..8b43900e92716c 100644 --- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu @@ -103,7 +103,7 @@ void _unfold_backward_internal_kernel( grad_out_data[grad_out_idx_dim * grad_out_dim_stride] = *grad_in_data; }; - _launch_unfold_backward_kernel(iter.numel(), loop); + _launch_unfold_backward_kernel(iter.numel(), loop); } else { auto offset_calc = make_offset_calculator<3>(iter); @@ -138,7 +138,7 @@ void _unfold_backward_internal_kernel( }; - _launch_unfold_backward_kernel(iter.numel(), loop); + _launch_unfold_backward_kernel(iter.numel(), loop); } } diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu index a3b04333a52807..f24f865440400a 100644 --- a/aten/src/ATen/test/cuda_vectorized_test.cu +++ b/aten/src/ATen/test/cuda_vectorized_test.cu @@ -8,11 +8,13 @@ using namespace at::native; using namespace at::native::memory; -__managed__ double4 buffer1[1024]; -__managed__ double4 buffer2[1024]; +constexpr int buffer_size = 1024; + +__managed__ double4 buffer1[buffer_size]; +__managed__ double4 buffer2[buffer_size]; void reset_buffers() { - for (int i = 0; i < 1024; i++) { + for (int i = 0; i < buffer_size; i++) { buffer1[i].x = i; buffer1[i].y = i + 0.1; buffer1[i].z = i + 0.2; @@ -74,6 +76,7 @@ TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) { // defined in `ATen/native/cuda/MemoryAccess.cuh` template __global__ void 
vectorized_copy(scalar_t *dst, scalar_t *src) { + static_assert(vec_size <= thread_work_size() && thread_work_size() % vec_size == 0, "Invalid vec_size"); using array_t = at::detail::Array; array_t data; data[0] = reinterpret_cast(dst); @@ -81,9 +84,15 @@ __global__ void vectorized_copy(scalar_t *dst, scalar_t *src) { int idx = blockIdx.x; using vectorized = policies::vectorized; auto policy = vectorized(data); - scalar_t buf[thread_work_size]; + scalar_t buf[thread_work_size()]; +#if defined(CUDA_VERSION) && CUDA_VERSION < 11000 + // This fails only on CUDA 10.x, remove this after CUDA 10.x support is dropped + scalar_t *buf_ = &buf[0]; + auto accessor = [&](int index) -> scalar_t & { return buf_[index]; }; +#else auto accessor = [&](int index) -> scalar_t & { return buf[index]; }; - policy.load_single_arg(accessor, src + 256 * blockIdx.x); +#endif + policy.load_single_arg(accessor, src + block_work_size() * blockIdx.x); policy.store(buf, idx); } @@ -98,7 +107,8 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { // vec4 copy reset_buffers(); cudaDeviceSynchronize(); - vectorized_copy<<<16, 64>>>(b2, b1); + constexpr int total_work_size = buffer_size * 4; + vectorized_copy<<>>(b2, b1); C10_CUDA_KERNEL_LAUNCH_CHECK(); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); @@ -112,7 +122,7 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { // vec2 copy reset_buffers(); cudaDeviceSynchronize(); - vectorized_copy<<<16, 64>>>(b2, b1); + vectorized_copy<<>>(b2, b1); C10_CUDA_KERNEL_LAUNCH_CHECK(); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); @@ -126,7 +136,7 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { // vec1 copy reset_buffers(); cudaDeviceSynchronize(); - vectorized_copy<<<16, 64>>>(b2, b1); + vectorized_copy<<>>(b2, b1); C10_CUDA_KERNEL_LAUNCH_CHECK(); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); @@ -136,8 +146,8 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { ASSERT_EQ(buffer1[i].z, buffer2[i].z); ASSERT_EQ(buffer1[i].w, buffer2[i].w); } -// Skipping this part until https://github.com/pytorch/pytorch/issues/51863 is resolved +// Skipping this part until https://github.com/pytorch/pytorch/issues/51863 is resolved #if 0 // unaligned for (int i = 0; i < 16; i++) { @@ -146,9 +156,7 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { b2 = reinterpret_cast(reinterpret_cast(buffer2) + j); cudaGetLastError(); cudaDeviceSynchronize(); - vectorized_copy<<<1, 64>>>(b2, b1); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - + vectorized_copy<<<1, num_threads()>>>(b2, b1); cudaDeviceSynchronize(); auto err = cudaGetLastError(); if (i % 16 == 0 && j % 16 == 0) { diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index db913c6fe23b42..bc257ef5430cf0 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -301,7 +301,7 @@ static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BF static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } /* Special case fp32 atomic. 
*/ -#if defined(__HIP_PLATFORM_HCC__) && defined(__gfx908__) +#if defined(__HIP_PLATFORM_HCC__) static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAddNoRet(address, val); } #else static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 26210cb56a1408..b0c6aef4e9be17 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -161,7 +161,7 @@ if(BUILD_CAFFE2 AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE)) # add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit add_subdirectory(transforms) endif() -if(NOT BUILD_CAFFE2) +if(NOT BUILD_CAFFE2 AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE)) add_subdirectory(proto) endif() @@ -1817,7 +1817,7 @@ if(BUILD_TEST) endif() # For special tests that explicitly uses dependencies, we add them here - if(USE_MPI) + if(BUILD_CAFFE2 AND USE_MPI) target_link_libraries(mpi_test ${MPI_CXX_LIBRARIES}) if(USE_CUDA) target_link_libraries(mpi_gpu_test ${MPI_CXX_LIBRARIES}) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index 85ec5b4ad00aa8..dd3e890689c344 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -131,7 +131,7 @@ else() endif() if(NOT DEFINED ENV{PYTORCH_ROCM_ARCH}) - set(PYTORCH_ROCM_ARCH gfx803;gfx900;gfx906;gfx908) + set(PYTORCH_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a) else() set(PYTORCH_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH}) endif() @@ -187,21 +187,40 @@ if(HIP_FOUND) set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) ### Remove setting of Flags when FindHIP.CMake PR #558 is accepted.### - set(hip_DIR ${HIP_PATH}/lib/cmake/hip) - set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) - set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) - set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) - set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) - set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) - set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) - set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) - set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) - set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) - set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) - set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) - set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) - set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) - set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) + # As of ROCm 5.1.x, all *.cmake files are under /opt/rocm/lib/cmake/ + if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.1.0") + set(hip_DIR ${ROCM_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCM_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${ROCM_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCM_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${ROCM_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCM_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${ROCM_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${ROCM_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${ROCM_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust) + else() + set(hip_DIR ${HIP_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + set(AMDDeviceLibs_DIR 
${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) + endif() find_package_and_print_version(hip REQUIRED) find_package_and_print_version(hsa-runtime64 REQUIRED) diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index 8beeb55b448404..f307aa251dd642 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -307,11 +307,11 @@ If the operator is an ATen operator (shows up in the TorchScript graph with the * Define the symbolic function in ``torch/onnx/symbolic_opset.py``, for example `torch/onnx/symbolic_opset9.py `_. - Make sure the function has the same name as the ATen function, which may declared in + Make sure the function has the same name as the ATen function, which may be declared in ``torch/_C/_VariableFunctions.pyi`` or ``torch/nn/functional.pyi`` (these files are generated at build time, so will not appear in your checkout until you build PyTorch). * The first arg is always the ONNX graph that is being built for export. - Other arg names must EXACTLY match the names in ``_VariableFunctions.pyi``, + Other arg names must EXACTLY match the names in the ``.pyi`` file, because dispatch is done with keyword arguments. * In the symbolic function, if the operator is in the `ONNX standard operator set `_, @@ -365,8 +365,8 @@ See the ``symbolic_opset*.py`` files for more examples. torch.autograd.Functions ^^^^^^^^^^^^^^^^^^^^^^^^ -If the operator is defined in a sub-class of :class:`torch.autograd.Function`, -there are two ways to export it. +If the operator is a sub-class of :class:`torch.autograd.Function`, there are two ways +to export it. Static Symbolic Method ~~~~~~~~~~~~~~~~~~~~~~ @@ -388,11 +388,11 @@ PythonOp Symbolic ~~~~~~~~~~~~~~~~~ Alternatively, you can register a custom symbolic function. -This gives the symoblic function access to more info through the +This gives the symbolic function access to more info through the TorchScript ``Node`` object for the original operation, which gets passed in as the second argument (after the ``Graph`` object). -All autograd ``Function``s are emitted in the TorchScript graph as ``prim::PythonOp`` nodes. +All autograd ``Function``\ s appear in the TorchScript graph as ``prim::PythonOp`` nodes. In order to differentiate between different ``Function`` subclasses, the symbolic function should use the ``name`` kwarg which gets set to the name of the class. @@ -400,11 +400,12 @@ symbolic function should use the ``name`` kwarg which gets set to the name of th the ``prim`` namespace, so for this use case, there's a back door: register the symbolic for ``"::prim_PythonOp"``. -Please also consider adding shape inference logic when you regiester a custom symbolic function -via setType API. This can help the exporter to obtain correct shape inference. -An example of setType is test_aten_embedding_2 in test_operators.py. -Although it is not required to add shape inference logic, -the exporter emits a warning message if it is not added. 
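[Editorial aside, not part of the patch: a minimal, hypothetical sketch of the "static symbolic method" route described in this section, including the optional ``setType`` hint that the replacement paragraph below recommends. The class and op choice here are assumptions for illustration only:

    import torch

    class MyRelu(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            return x.clamp(min=0)

        @staticmethod
        def symbolic(g, x):
            ret = g.op("Relu", x)
            # Optional: copy the input's type/shape onto the output to help
            # shape/type inference for downstream nodes.
            ret.setType(x.type())
            return ret

Tracing a module that calls ``MyRelu.apply(x)`` and exporting it then emits a plain ``Relu`` node.]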
+Custom symbolic functions should add type and shape information by calling ``setType(...)`` +on Value objects before returning them (implemented in C++ by +``torch::jit::Value::setType``). This is not required, but it can help the exporter's +shape and type inference for down-stream nodes. For a non-trivial example of ``setType``, see +``test_aten_embedding_2`` in +`test_operators.py `_. The example below shows how you can access ``requires_grad`` via the ``Node`` object:: @@ -430,13 +431,17 @@ The example below shows how you can access ``requires_grad`` via the ``Node`` ob print("arg {}: {}, requires grad: {}".format(i, arg, requires_grad)) name = kwargs["name"] + ret = None if name == "MyClip": - return g.op("Clip", args[0], min_f=args[1]) + ret = g.op("Clip", args[0], min_f=args[1]) elif name == "MyRelu": - return g.op("Relu", args[0]) + ret = g.op("Relu", args[0]) else: # Logs a warning and returns None return _unimplemented("prim::PythonOp", "unknown node kind: " + name) + # Copy type and shape from original node. + ret.setType(n.type()) + return ret from torch.onnx import register_custom_op_symbolic register_custom_op_symbolic("::prim_PythonOp", symbolic_pythonop, 1) diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 084dad349004a8..c828f0ecbf2a12 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -597,5 +597,4 @@ Utilities are_deterministic_algorithms_enabled set_warn_always is_warn_always_enabled - vmap _assert diff --git a/ios/TestApp/AppleWWDRCAG3.cer b/ios/TestApp/AppleWWDRCAG3.cer new file mode 100644 index 00000000000000..32f96f81dd6ea4 Binary files /dev/null and b/ios/TestApp/AppleWWDRCAG3.cer differ diff --git a/ios/TestApp/fastlane/Fastfile b/ios/TestApp/fastlane/Fastfile index 3e8ef00f0f7591..42a6346e1b7a47 100644 --- a/ios/TestApp/fastlane/Fastfile +++ b/ios/TestApp/fastlane/Fastfile @@ -4,7 +4,14 @@ platform :ios do before_all do setup_circle_ci end - lane :install_cert do + lane :install_root_cert do + import_certificate( + certificate_path: "AppleWWDRCAG3.cer", + keychain_path: "/Users/distiller/Library/Keychains/fastlane_tmp_keychain-db", + keychain_password: "" + ) + end + lane :install_dev_cert do puts "Installing Certificates.p12" import_certificate( keychain_name: ENV["MATCH_KEYCHAIN_NAME"], diff --git a/related_commits b/related_commits new file mode 100644 index 00000000000000..f2349f17655c30 --- /dev/null +++ b/related_commits @@ -0,0 +1,4 @@ +ubuntu|pytorch|apex|none| +centos|pytorch|apex|none| +ubuntu|pytorch|torchvision|release/0.11|fa347eb9f38c1759b73677a11b17335191e3f602 +centos|pytorch|torchvision|release/0.11|fa347eb9f38c1759b73677a11b17335191e3f602 diff --git a/test/cpp_extensions/cublas_extension.cpp b/test/cpp_extensions/cublas_extension.cpp new file mode 100644 index 00000000000000..61945b1aa223a3 --- /dev/null +++ b/test/cpp_extensions/cublas_extension.cpp @@ -0,0 +1,17 @@ +#include + +#include +#include + +#include + +torch::Tensor noop_cublas_function(torch::Tensor x) { + cublasHandle_t handle; + TORCH_CUDABLAS_CHECK(cublasCreate(&handle)); + TORCH_CUDABLAS_CHECK(cublasDestroy(handle)); + return x; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("noop_cublas_function", &noop_cublas_function, "a cublas function"); +} diff --git a/test/cpp_extensions/cusolver_extension.cpp b/test/cpp_extensions/cusolver_extension.cpp new file mode 100644 index 00000000000000..515d09958a8d27 --- /dev/null +++ b/test/cpp_extensions/cusolver_extension.cpp @@ -0,0 +1,17 @@ +#include +#include + +#include + + +torch::Tensor 
noop_cusolver_function(torch::Tensor x) { + cusolverDnHandle_t handle; + TORCH_CUSOLVER_CHECK(cusolverDnCreate(&handle)); + TORCH_CUSOLVER_CHECK(cusolverDnDestroy(handle)); + return x; +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("noop_cusolver_function", &noop_cusolver_function, "a cusolver function"); +} diff --git a/test/cpp_extensions/setup.py b/test/cpp_extensions/setup.py index 7888d0e3a88bbd..96301510f05adf 100644 --- a/test/cpp_extensions/setup.py +++ b/test/cpp_extensions/setup.py @@ -48,6 +48,19 @@ 'nvcc': ['-O2']}) ext_modules.append(extension) +if torch.cuda.is_available() and CUDA_HOME is not None: + cublas_extension = CUDAExtension( + name='torch_test_cpp_extension.cublas_extension', + sources=['cublas_extension.cpp'] + ) + ext_modules.append(cublas_extension) + + cusolver_extension = CUDAExtension( + name='torch_test_cpp_extension.cusolver_extension', + sources=['cusolver_extension.cpp'] + ) + ext_modules.append(cusolver_extension) + setup( name='torch_test_cpp_extension', packages=['torch_test_cpp_extension'], diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 811137a8d83b42..6e998c5c271d39 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -15,7 +15,7 @@ import time import unittest from itertools import product -from typing import Dict, List, Union, Callable +from typing import Callable, Dict, List, Union from unittest import mock from unittest.mock import patch @@ -24,25 +24,25 @@ from torch.distributed.elastic.multiprocessing import ProcessFailure, start_processes from torch.distributed.elastic.multiprocessing.api import ( MultiprocessContext, - SignalException, RunProcsResult, + SignalException, Std, _validate_full_rank, - to_map, _wrap, + to_map, ) from torch.distributed.elastic.multiprocessing.errors.error_handler import _write_error from torch.testing._internal.common_utils import ( + IS_IN_CI, + IS_MACOS, + IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN, - TEST_WITH_TSAN, TEST_WITH_DEV_DBG_ASAN, - IS_IN_CI, - IS_WINDOWS, - IS_MACOS, + TEST_WITH_TSAN, + run_tests, sandcastle_skip_if, ) -from torch.testing._internal.common_utils import run_tests class RunProcResultsTest(unittest.TestCase): @@ -224,6 +224,7 @@ def start_processes_zombie_test( # tests incompatible with tsan or asan if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS): + class StartProcessesTest(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp(prefix=f"{self.__class__.__name__}_") @@ -251,12 +252,15 @@ def assert_pids_noexist(self, pids: Dict[int, int]): def test_to_map(self): local_world_size = 2 - self.assertEqual({0: Std.OUT, 1: Std.OUT}, to_map(Std.OUT, local_world_size)) + self.assertEqual( + {0: Std.OUT, 1: Std.OUT}, to_map(Std.OUT, local_world_size) + ) self.assertEqual( {0: Std.NONE, 1: Std.OUT}, to_map({1: Std.OUT}, local_world_size) ) self.assertEqual( - {0: Std.ERR, 1: Std.OUT}, to_map({0: Std.ERR, 1: Std.OUT}, local_world_size) + {0: Std.ERR, 1: Std.OUT}, + to_map({0: Std.ERR, 1: Std.OUT}, local_world_size), ) def test_invalid_log_dir(self): @@ -382,9 +386,7 @@ def test_void_function(self): results = pc.wait(period=0.1) self.assertEqual({0: None, 1: None}, results.return_values) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "tests incompatible with asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "tests incompatible with asan") def test_function_large_ret_val(self): # python 
multiprocessing.queue module uses pipes and actually PipedQueues # This means that if a single object is greater than a pipe size @@ -439,7 +441,9 @@ def test_function_raise(self): self.assertEqual(1, failure.exitcode) self.assertEqual("", failure.signal_name()) self.assertEqual(pc.pids()[0], failure.pid) - self.assertEqual(os.path.join(log_dir, "0", "error.json"), error_file) + self.assertEqual( + os.path.join(log_dir, "0", "error.json"), error_file + ) self.assertEqual( int(error_file_data["message"]["extraInfo"]["timestamp"]), int(failure.timestamp), @@ -541,17 +545,22 @@ def test_multiprocessing_context_poll_raises_exception(self): run_result = mp_context._poll() self.assertEqual(1, len(run_result.failures)) failure = run_result.failures[0] - self.assertEqual("Signal 1 (SIGHUP) received by PID 123", failure.message) + self.assertEqual( + "Signal 1 (SIGHUP) received by PID 123", failure.message + ) # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS): + class StartProcessesListTest(StartProcessesTest): ######################################## # start_processes as binary tests ######################################## def test_function(self): - for start_method, redirs in product(self._start_methods, redirects_oss_test()): + for start_method, redirs in product( + self._start_methods, redirects_oss_test() + ): with self.subTest(start_method=start_method, redirs=redirs): pc = start_processes( name="echo", @@ -644,6 +653,7 @@ def test_binary_redirect_and_tee(self): # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_IN_CI): + class StartProcessesNotCITest(StartProcessesTest): def test_wrap_bad(self): none = "" @@ -796,7 +806,8 @@ def test_function_exit(self): self.assertEqual(pc.pids()[0], failure.pid) self.assertEqual("", error_file) self.assertEqual( - f"Process failed with exitcode {FAIL}", failure.message + "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html", + failure.message, ) self.assertLessEqual(failure.timestamp, int(time.time())) diff --git a/test/distributed/elastic/multiprocessing/errors/api_test.py b/test/distributed/elastic/multiprocessing/errors/api_test.py index 859069004ae712..a24986c659811f 100644 --- a/test/distributed/elastic/multiprocessing/errors/api_test.py +++ b/test/distributed/elastic/multiprocessing/errors/api_test.py @@ -115,7 +115,10 @@ def test_process_failure_no_error_file(self): pf = self.failure_without_error_file(exitcode=138) self.assertEqual("", pf.signal_name()) self.assertEqual("", pf.error_file) - self.assertEqual("Process failed with exitcode 138", pf.message) + self.assertEqual( + "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html", + pf.message, + ) def test_child_failed_error(self): pf0 = self.failure_with_error_file(exception=SentinelError("rank 0")) @@ -134,7 +137,7 @@ def test_child_failed_error(self): rank: 0 (local_rank: 0) exitcode: 1 (pid: 997) error_file: /tmp/ApiTesttbb37ier/error.json - msg: "SentinelError: rank 0" + traceback: "SentinelError: rank 0" ============================================= Other Failures: [1]: @@ -148,7 +151,7 @@ def test_child_failed_error(self): rank: 2 (local_rank: 0) exitcode: 138 (pid: 997) error_file: - msg: "Process failed with exitcode 138" + traceback: To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
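[Editorial aside, not part of the patch: the new failure message above points users at the elastic errors documentation instead of repeating the exit code. The documented pattern it refers to is wrapping the worker entrypoint with the ``record`` decorator so the traceback is written to the error file these tests inspect; a minimal sketch:

    from torch.distributed.elastic.multiprocessing.errors import record

    @record
    def main():
        # Any exception raised here is recorded to the per-rank error file,
        # which is what produces the "traceback:" field shown above.
        raise RuntimeError("worker failed")

    if __name__ == "__main__":
        main()
]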
********************************************* """ print(ex) diff --git a/test/jit/test_export_modes.py b/test/jit/test_export_modes.py index 70f236f59871f8..80c8e28ad00f78 100644 --- a/test/jit/test_export_modes.py +++ b/test/jit/test_export_modes.py @@ -64,7 +64,7 @@ def foo(a): return (a, a) f = io.BytesIO() x = torch.ones(3) - torch.onnx._export(foo, (x,), f, example_outputs=(x, x)) + torch.onnx._export(foo, (x,), f) @skipIfNoLapack def test_aten_fallback(self): @@ -76,9 +76,8 @@ def forward(self, x, y): x = torch.rand(3, 4) y = torch.rand(3, 4) - f = io.BytesIO() torch.onnx.export_to_pretty_string( - ModelWithAtenNotONNXOp(), (x, y), f, + ModelWithAtenNotONNXOp(), (x, y), None, add_node_names=False, do_constant_folding=False, operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK) @@ -91,11 +90,10 @@ class ModelWithAtenFmod(nn.Module): def forward(self, x, y): return torch.fmod(x, y) - f = io.BytesIO() x = torch.randn(3, 4, dtype=torch.float32) y = torch.randn(3, 4, dtype=torch.float32) torch.onnx.export_to_pretty_string( - ModelWithAtenFmod(), (x, y), f, + ModelWithAtenFmod(), (x, y), None, add_node_names=False, do_constant_folding=False, operator_export_type=OperatorExportTypes.ONNX_ATEN) diff --git a/test/jit/test_onnx_export.py b/test/jit/test_onnx_export.py index 21ee19d262aab1..b999a1ee0b8a39 100644 --- a/test/jit/test_onnx_export.py +++ b/test/jit/test_onnx_export.py @@ -49,20 +49,17 @@ def forward(self, x): tm = TraceMe() tm = torch.jit.trace(tm, torch.rand(3, 4)) - example_outputs = (tm(torch.rand(3, 4)),) f = io.BytesIO() - torch.onnx._export(tm, (torch.rand(3, 4),), f, example_outputs=example_outputs) + torch.onnx._export(tm, (torch.rand(3, 4),), f) def test_export_tensoroption_to(self): def foo(x): return x[0].clone().detach().cpu() + x traced = torch.jit.trace(foo, (torch.rand([2]))) - example_outputs = traced(torch.rand([2])) f = io.BytesIO() - torch.onnx._export_to_pretty_string(traced, (torch.rand([2]),), f, - example_outputs=example_outputs) + torch.onnx._export_to_pretty_string(traced, (torch.rand([2]),), f) def test_onnx_export_script_module(self): class ModuleToExport(torch.jit.ScriptModule): @@ -75,10 +72,8 @@ def forward(self, x): return x + x mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) @suppress_warnings def test_onnx_export_func_with_warnings(self): @@ -93,11 +88,9 @@ def __init__(self): def forward(self, x): return func_with_warning(x) - outputs = WarningTest()(torch.randn(42)) # no exception torch.onnx.export_to_pretty_string( - WarningTest(), torch.randn(42), None, verbose=False, - example_outputs=outputs) + WarningTest(), torch.randn(42), None, verbose=False) def test_onnx_export_script_python_fail(self): class PythonModule(torch.jit.ScriptModule): @@ -119,11 +112,9 @@ def forward(self, x): return y + y mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) f = io.BytesIO() with self.assertRaisesRegex(RuntimeError, "Couldn't export Python"): - torch.onnx._export(mte, (torch.zeros(1, 2, 3),), f, verbose=False, - example_outputs=outputs) + torch.onnx._export(mte, (torch.zeros(1, 2, 3),), f, verbose=False) def test_onnx_export_script_inline_trace(self): class ModuleToInline(torch.nn.Module): @@ -144,10 +135,8 @@ def forward(self, x): return y + y mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 
3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) def test_onnx_export_script_inline_script(self): class ModuleToInline(torch.jit.ScriptModule): @@ -169,10 +158,8 @@ def forward(self, x): return y + y mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) def test_onnx_export_script_module_loop(self): class ModuleToExport(torch.jit.ScriptModule): @@ -189,10 +176,8 @@ def forward(self, x): return x mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) @suppress_warnings def test_onnx_export_script_truediv(self): @@ -206,11 +191,9 @@ def forward(self, x): return x + z mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3, dtype=torch.float),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3, dtype=torch.float),), None, verbose=False) def test_onnx_export_script_non_alpha_add_sub(self): class ModuleToExport(torch.jit.ScriptModule): @@ -223,10 +206,8 @@ def forward(self, x): return bs - 1 mte = ModuleToExport() - outputs = torch.LongTensor([mte(torch.rand(3, 4))]) torch.onnx.export_to_pretty_string( - mte, (torch.rand(3, 4),), None, verbose=False, - example_outputs=outputs) + mte, (torch.rand(3, 4),), None, verbose=False) def test_onnx_export_script_module_if(self): class ModuleToExport(torch.jit.ScriptModule): @@ -240,10 +221,8 @@ def forward(self, x): return x mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3, dtype=torch.long)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) def test_onnx_export_script_inline_params(self): class ModuleToInline(torch.jit.ScriptModule): @@ -272,8 +251,7 @@ def forward(self, x): reference = torch.mm(torch.mm(torch.zeros(2, 3), torch.ones(3, 3)), torch.ones(3, 4)) self.assertEqual(result, reference) torch.onnx.export_to_pretty_string( - mte, (torch.ones(2, 3),), None, verbose=False, - example_outputs=result) + mte, (torch.ones(2, 3),), None, verbose=False) def test_onnx_export_speculate(self): @@ -305,18 +283,16 @@ def transpose(x): return x.t() f1 = Foo(transpose) - outputs_f1 = f1(torch.ones(1, 10, dtype=torch.float)) f2 = Foo(linear) - outputs_f2 = f2(torch.ones(1, 10, dtype=torch.float)) torch.onnx.export_to_pretty_string( f1, (torch.ones(1, 10, dtype=torch.float), ), - None, verbose=False, example_outputs=outputs_f1) + None, verbose=False) torch.onnx.export_to_pretty_string( f2, (torch.ones(1, 10, dtype=torch.float), ), - None, verbose=False, example_outputs=outputs_f2) + None, verbose=False) def test_onnx_export_shape_reshape(self): class Foo(torch.nn.Module): @@ -328,10 +304,8 @@ def forward(self, x): return reshaped foo = torch.jit.trace(Foo(), torch.zeros(1, 2, 3)) - outputs = foo(torch.zeros(1, 2, 3)) f = io.BytesIO() - torch.onnx.export_to_pretty_string(foo, (torch.zeros(1, 2, 3)), f, - example_outputs=outputs) + torch.onnx.export_to_pretty_string(foo, (torch.zeros(1, 2, 3)), f) def test_listconstruct_erasure(self): class FooMod(torch.nn.Module): @@ -360,11 +334,10 @@ def forward(self, x): mod = DynamicSliceExportMod() input 
= torch.rand(3, 4, 5) - example_outs = mod(input) f = io.BytesIO() torch.onnx.export_to_pretty_string( - DynamicSliceExportMod(), (input,), f, example_outputs=example_outs, opset_version=10) + DynamicSliceExportMod(), (input,), f, opset_version=10) def test_export_dict(self): class DictModule(torch.nn.Module): @@ -380,4 +353,4 @@ def forward(self, x_in: torch.Tensor) -> typing.Dict[str, torch.Tensor]: with self.assertRaisesRegex(RuntimeError, r"DictConstruct.+is not supported."): torch.onnx.export_to_pretty_string( - torch.jit.script(mod), (x_in,), f, example_outputs=(mod(x_in),)) + torch.jit.script(mod), (x_in,), f) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 1d95dc8d0d8a4b..ee2402cc32b728 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -1115,9 +1115,8 @@ class Mod(torch.nn.Module): def forward(self, x, w): return torch.matmul(x, w).detach() - f = io.BytesIO() torch.onnx.export_to_pretty_string( - Mod(), (torch.rand(3, 4), torch.rand(4, 5)), f) + Mod(), (torch.rand(3, 4), torch.rand(4, 5)), None) def test_trace_slice_full_dim(self): def foo(x): diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index 59909db5958cc6..3bcd57018c36a8 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -19,7 +19,7 @@ def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None): outputs = model(inputs) script_model = torch.jit.script(model) - run_model_test(self, script_model, False, example_outputs=outputs, + run_model_test(self, script_model, False, input=inputs, rtol=rtol, atol=atol) diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index 4a12d92141bdca..53f0faec55d031 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -40,14 +40,13 @@ def check_onnx_opset_operator(model, ops, opset_version=_export_onnx_opset_versi assert attributes[j][attribute_field] == getattr(graph.node[i].attribute[j], attribute_field) -def check_onnx_opsets_operator(module, x, ops, opset_versions, training=torch.onnx.TrainingMode.EVAL, example_outputs=None, +def check_onnx_opsets_operator(module, x, ops, opset_versions, training=torch.onnx.TrainingMode.EVAL, input_names=None, dynamic_axes=None): for opset_version in opset_versions: f = io.BytesIO() torch.onnx.export(module, x, f, opset_version=opset_version, training=training, - example_outputs=example_outputs, input_names=input_names, dynamic_axes=dynamic_axes) model = onnx.load(io.BytesIO(f.getvalue())) @@ -91,10 +90,8 @@ def forward(self, input, k): x = torch.arange(1., 6., requires_grad=True) k = torch.tensor(3) module = MyModuleDynamic() - example_output = module(x, k) check_onnx_opsets_operator(module, [x, k], ops, - opset_versions=[10], - example_outputs=example_output) + opset_versions=[10]) def test_maxpool(self): module = torch.nn.MaxPool1d(2, stride=1) @@ -191,7 +188,6 @@ def forward(self, x): module = DynamicSliceModel() x = torch.rand(1, 2) - example_output = module(x) ops_10 = [{"op_name" : "Shape"}, {"op_name" : "Constant"}, {"op_name" : "Gather", @@ -202,7 +198,7 @@ def forward(self, x): {"op_name" : "Slice", "attributes" : []}] ops = {10 : ops_10} - check_onnx_opsets_operator(module, x, ops, opset_versions=[10], example_outputs=example_output, + check_onnx_opsets_operator(module, x, ops, opset_versions=[10], input_names=['x'], dynamic_axes={"x": [0, 1]}) ops_10 = [{"op_name" : "Constant"}, @@ -212,7 +208,7 @@ def forward(self, x): {"op_name" : "Slice", "attributes" : []}] 
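[Editorial aside, not part of the patch: the test updates in this region all reflect the same API change, namely that ``example_outputs`` has been removed from the export entry points, so scripted and traced modules are exported from their inputs alone. A minimal sketch assuming the updated API:

    import io
    import torch

    class Doubler(torch.nn.Module):
        def forward(self, x):
            return x + x

    f = io.BytesIO()
    # No example_outputs argument; the exporter runs the module itself.
    torch.onnx.export(torch.jit.script(Doubler()), (torch.zeros(1, 2, 3),), f,
                      opset_version=10)
]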
ops = {10 : ops_10} - check_onnx_opsets_operator(module, x, ops, opset_versions=[10], example_outputs=example_output) + check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) def test_flip(self): class MyModule(Module): diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 14d47945f64377..696e01b2c393a0 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -279,12 +279,11 @@ def test_conv_onnx_irv4_opset8(self): def test_conv_variable_length(self): x = torch.ones(5, 3, 6, 6, requires_grad=True) model = torch.nn.Conv2d(3, 2, 3) - y = model(x) dynamic_axes = {"input_1": [0, 2, 3], "output_1": {0: "output_1_variable_dim_0", 1: "output_1_variable_dim_1"}} model_proto_name = "conv2d.onnx" torch.onnx.export(model, x, model_proto_name, verbose=True, input_names=["input_1"], output_names=["output_1"], - example_outputs=y, dynamic_axes=dynamic_axes) + dynamic_axes=dynamic_axes) import onnx onnx_model = onnx.load(model_proto_name) @@ -729,22 +728,6 @@ def test_cumsum(self): x = torch.randn(2, 3, 4, requires_grad=True) self.assertONNX(lambda x: torch.cumsum(x, dim=1), x, opset_version=11) - def test_retain_param_name_disabled(self): - class MyModule(Module): - def __init__(self): - super(MyModule, self).__init__() - self.fc1 = nn.Linear(4, 5, bias=False) - self.fc1.weight.data.fill_(2.) - self.fc2 = nn.Linear(5, 6, bias=False) - self.fc2.weight.data.fill_(3.) - - def forward(self, x): - return self.fc2(self.fc1(x)) - - x = torch.randn(3, 4).float() - self.assertONNX(MyModule(), (x,), _retain_param_name=False, - keep_initializers_as_inputs=True) - def test_c2_op(self): class MyModel(torch.nn.Module): def __init__(self): diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index fad9a0ba5b66ef..c4074bb1bbcbfb 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -134,7 +134,7 @@ def convert_cuda(self, model, input): return cuda_model, cuda_input def run_debug_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, example_outputs=None, + input=None, use_gpu=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX): """ # TODO: remove this from the final release version @@ -153,7 +153,6 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, model, input = self.convert_cuda(model, input) onnxir, torch_out = do_export(model, input, export_params=self.embed_params, verbose=False, - example_outputs=example_outputs, do_constant_folding=False, opset_version=self.opset_version, keep_initializers_as_inputs=True, @@ -168,7 +167,7 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, def run_actual_test(self, model, train, batch_size, state_dict=None, input=None, use_gpu=True, rtol=0.001, atol=1e-7, - example_outputs=None, do_constant_folding=True, + do_constant_folding=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX, input_names=None, dynamic_axes=None, remained_onnx_input_idx=None): @@ -191,7 +190,6 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, # Verify the model runs the same in Caffe2 verify.verify(model, input, c2, rtol=rtol, atol=atol, - example_outputs=example_outputs, do_constant_folding=do_constant_folding, opset_version=self.opset_version, keep_initializers_as_inputs=True, @@ -202,7 +200,7 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, def run_model_test(self, model, train, batch_size, state_dict=None, input=None, use_gpu=True, rtol=0.001, 
atol=1e-7, - example_outputs=None, do_constant_folding=True, + do_constant_folding=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX, input_names=None, dynamic_axes=None, remained_onnx_input_idx=None): @@ -214,7 +212,6 @@ def run_model_test(self, model, train, batch_size, state_dict=None, if self.embed_params: self.run_actual_test(model, train, batch_size, state_dict, input, use_gpu=use_gpu_, rtol=rtol, atol=atol, - example_outputs=example_outputs, do_constant_folding=do_constant_folding, operator_export_type=operator_export_type, input_names=input_names, @@ -222,8 +219,7 @@ def run_model_test(self, model, train, batch_size, state_dict=None, remained_onnx_input_idx=remained_onnx_input_idx) else: self.run_debug_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_, example_outputs=example_outputs, - operator_export_type=operator_export_type) + use_gpu=use_gpu_, operator_export_type=operator_export_type) def test_linear(self): class MyModel(torch.nn.Module): @@ -289,7 +285,7 @@ def forward(self, input): # This test checks that given this edge condition, the model can be loaded and executed # in Caffe2 backend correctly. torch.onnx._export(model, input, f, verbose=True, export_type=ExportTypes.ZIP_ARCHIVE, - input_names=["input1", "fc1.bias"], _retain_param_name=False, + input_names=["input1", "fc1.bias"], keep_initializers_as_inputs=True) f.seek(0) @@ -1401,9 +1397,8 @@ def forward(self, x): return x[1:x.size(0)] module = DynamicSliceModel() x = torch.rand(1, 2) - example_output = module(x) self.run_model_test(DynamicSliceModel(), train=False, input=(x,), - batch_size=BATCH_SIZE, use_gpu=False, example_outputs=example_output) + batch_size=BATCH_SIZE, use_gpu=False) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_slice_to_the_end(self): @@ -1472,7 +1467,7 @@ def forward(self): y = torch.ones(2, 3, 4) * 2 self.run_model_test(Arithmetic(), train=False, input=(), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(x + 3, y * (x + 3))) + use_gpu=False) def test_tensor_factories(self): class TensorFactory(torch.nn.Module): @@ -1493,11 +1488,9 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(torch.ones(x.size()),), - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) + use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(torch.ones(x.size()),), - remained_onnx_input_idx=[]) + use_gpu=False, remained_onnx_input_idx=[]) def test_tensor_like_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @@ -1509,12 +1502,10 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(torch.ones(x.size()),), - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) + use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) remained_onnx_input_idx = None if self.opset_version < 9 else [] self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(torch.ones(x.size()),), - remained_onnx_input_idx=remained_onnx_input_idx) + use_gpu=False, remained_onnx_input_idx=remained_onnx_input_idx) def test_full(self): class FullModel(torch.nn.Module): @@ -1532,8 +1523,7 @@ def forward(self, x): return torch.full((4, 5), x, dtype=torch.long) x = 
torch.tensor(12) - self.run_model_test(FullClass(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=FullClass()(x)) + self.run_model_test(FullClass(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) def test_clamp(self): class ClampModel(torch.nn.Module): @@ -2021,8 +2011,7 @@ def forward(self, a: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, t return a x = (torch.randn(3, 4), torch.randn(4, 3)) - self.run_model_test(TupleModel(), train=False, input=(x,), batch_size=BATCH_SIZE, - example_outputs=(x,)) + self.run_model_test(TupleModel(), train=False, input=(x,), batch_size=BATCH_SIZE) def test_nested_tuple_input_output(self): class NestedTupleModel(torch.jit.ScriptModule): @@ -2032,8 +2021,7 @@ def forward(self, a: torch.Tensor, b: Tuple[torch.Tensor, Tuple[torch.Tensor, to x = torch.randn(4, 5) y = (torch.randn(4, 5), (torch.randn(4, 5), torch.randn(4, 5))) - self.run_model_test(NestedTupleModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, - example_outputs=x + y[0] + y[1][0] + y[1][1]) + self.run_model_test(NestedTupleModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) def test_topk(self): class TopKModel(torch.nn.Module): @@ -2050,7 +2038,7 @@ def forward(self, input): return torch.topk(input, 3, dim=0) x = torch.randn(4, 3, requires_grad=True) - self.run_model_test(TopKModel(), train=False, input=(x,), batch_size=BATCH_SIZE, example_outputs=torch.topk(x, 3, dim=0)) + self.run_model_test(TopKModel(), train=False, input=(x,), batch_size=BATCH_SIZE) def test_floor(self): class FloorModel(torch.nn.Module): @@ -2086,9 +2074,7 @@ def forward(self, a): return torch.arange(a.size(0), dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - outputs = ArangeScript()(x) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) class ArangeModel(torch.nn.Module): def forward(self, a): @@ -2104,9 +2090,7 @@ def forward(self, a): return torch.arange(2, a.size(0) + 2, dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - outputs = ArangeScript()(x) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) class ArangeModel(torch.nn.Module): def forward(self, a): @@ -2122,9 +2106,7 @@ def forward(self, a): return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - outputs = ArangeScript()(x) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) class ArangeModel(torch.nn.Module): def forward(self, a): @@ -2249,9 +2231,7 @@ def forward(self, x): model = WhileModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) def test_while_cond(self): class WhileModel(torch.jit.ScriptModule): @@ -2266,9 +2246,7 @@ def forward(self, x, a): model = WhileModel() x = torch.zeros(1, 2, 3, dtype=torch.long) a = torch.tensor([0], dtype=torch.long) - outputs = model(x, a) - 
self.run_model_test(model, train=False, input=(x, a), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(x, a), batch_size=BATCH_SIZE) @unittest.skip("Disabled due to onnx optimizer deprecation") def test_loop(self): @@ -2281,9 +2259,7 @@ def forward(self, x): model = LoopModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) @unittest.skip("Disabled due to onnx optimizer deprecation") def test_dynamic_loop(self): @@ -2296,9 +2272,7 @@ def forward(self, x): model = LoopModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedMinOpsetVersion(9) @@ -2317,9 +2291,7 @@ def forward(self, x): model = NestedLoopsModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) def test_select(self): class SelectModel(torch.nn.Module): @@ -2337,9 +2309,7 @@ def forward(self, input): model = StandardDeviation() inputs = torch.randn(2, 3, 4) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) def test_std_along_dims(self): class StandardDeviationAlongDims(torch.nn.Module): @@ -2348,9 +2318,7 @@ def forward(self, input): model = StandardDeviationAlongDims() inputs = torch.randn(2, 3, 4) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) @skipIfUnsupportedMinOpsetVersion(9) def test_masked_fill(self): @@ -2379,9 +2347,7 @@ def forward(self, x, y, z): y = torch.zeros(4, requires_grad=True) z = torch.ones(5, requires_grad=True) model = MeshgridModel() - outputs = model(x, y, z) - self.run_model_test(model, train=False, input=(x, y, z), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(x, y, z), batch_size=BATCH_SIZE) def test_remainder(self): class RemainderModel(torch.nn.Module): @@ -2391,9 +2357,7 @@ def forward(self, input, other): x = torch.randn(4, 2, 3) y = torch.randn(1, 2, 1) model = RemainderModel() - outputs = model(x, y) - self.run_model_test(model, train=False, input=(x, y), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(x, y), batch_size=BATCH_SIZE) def test_remainder_scalar(self): class RemainderModel(torch.nn.Module): @@ -2402,9 +2366,7 @@ def forward(self, input): inputs = torch.randint(10, (2, 3)) model = RemainderModel() - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) def test_baddbmm(self): class 
MyModule(torch.nn.Module): @@ -2423,9 +2385,7 @@ def forward(self, x): model = GeluModel() inputs = torch.randn(2, 4, 5, 6, requires_grad=True) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) @skipIfUnsupportedMinOpsetVersion(9) def test_index_fill(self): diff --git a/test/onnx/test_pytorch_onnx_caffe2_quantized.py b/test/onnx/test_pytorch_onnx_caffe2_quantized.py index 25b95563cde989..7a194a66d27ad8 100644 --- a/test/onnx/test_pytorch_onnx_caffe2_quantized.py +++ b/test/onnx/test_pytorch_onnx_caffe2_quantized.py @@ -28,7 +28,7 @@ def generic_test(self, model, sample_inputs, input_names=None, decimal=3, relaxe output = q_model(*pt_inputs) f = io.BytesIO() - torch.onnx.export(q_model, pt_inputs, f, input_names=input_names, example_outputs=output, + torch.onnx.export(q_model, pt_inputs, f, input_names=input_names, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) f.seek(0) onnx_model = onnx.load(f) @@ -84,8 +84,6 @@ def test_quantized_relu(self): self.generic_unary_test(torch.nn.ReLU()) def export_to_onnx(self, model, input, input_names): - outputs = model(input) - traced = torch.jit.trace(model, input) buf = io.BytesIO() torch.jit.save(traced, buf) @@ -93,7 +91,7 @@ def export_to_onnx(self, model, input, input_names): model = torch.jit.load(buf) f = io.BytesIO() - torch.onnx.export(model, input, f, input_names=input_names, example_outputs=outputs, + torch.onnx.export(model, input, f, input_names=input_names, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) f.seek(0) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 3763282147d8d2..35ff1b75e11ce3 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -45,9 +45,9 @@ def to_numpy(tensor): else: return tensor.cpu().numpy() -def convert_to_onnx(model, input=None, opset_version=9, example_outputs=None, - do_constant_folding=True, keep_initializers_as_inputs=True, - dynamic_axes=None, input_names=None, output_names=None, +def convert_to_onnx(model, input=None, opset_version=9, do_constant_folding=True, + keep_initializers_as_inputs=True, dynamic_axes=None, + input_names=None, output_names=None, fixed_batch_size=False, training=None, onnx_shape_inference=True): # export the model to ONNX @@ -55,7 +55,6 @@ def convert_to_onnx(model, input=None, opset_version=9, example_outputs=None, input_copy = copy.deepcopy(input) torch.onnx._export(model, input_copy, f, opset_version=opset_version, - example_outputs=example_outputs, do_constant_folding=do_constant_folding, keep_initializers_as_inputs=keep_initializers_as_inputs, dynamic_axes=dynamic_axes, @@ -132,7 +131,7 @@ def run_model_test(self, model, batch_size=2, state_dict=None, input = input + ({},) ort_sess = convert_to_onnx(model, input=input, opset_version=self.opset_version, - example_outputs=output, do_constant_folding=do_constant_folding, + do_constant_folding=do_constant_folding, keep_initializers_as_inputs=self.keep_initializers_as_inputs, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, fixed_batch_size=fixed_batch_size, training=training, @@ -284,9 +283,9 @@ def _run_test(m, remained_onnx_input_idx): _run_test(model, tracing_remained_onnx_input_idx) def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, - 
example_outputs=None, do_constant_folding=True, - dynamic_axes=None, input_names=None, output_names=None, - ort_optim_on=True, training=None): + do_constant_folding=True, dynamic_axes=None, + input_names=None, output_names=None, + ort_optim_on=True, training=None, use_external_data_format=None): import os import tempfile @@ -310,13 +309,12 @@ def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, input_copy = copy.deepcopy(input) torch.onnx.export(model, input_copy, model_file_name, opset_version=self.opset_version, - example_outputs=output, verbose=False, do_constant_folding=do_constant_folding, keep_initializers_as_inputs=self.keep_initializers_as_inputs, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, - use_external_data_format=True) + use_external_data_format=use_external_data_format) # compute onnxruntime output prediction ort_sess_opt = onnxruntime.SessionOptions() ort_sess_opt.graph_optimization_level = \ @@ -366,19 +364,55 @@ def forward(self, x): return x + torch.ones(2, 1024) x = torch.randn(2, 1) - self.run_model_test_with_external_data(LargeModel(), x) + self.run_model_test_with_external_data(LargeModel(), x, use_external_data_format=None) @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. - @unittest.skip("Enable this once large model with subgraph is supported in ORT") - def test_subgraph_with_external_data(self): + def test_largemodel_without_use_external_data_format_param(self): class LargeModel(torch.nn.Module): - def forward(self, x): - for i in range(x.size(0)): - x = x + torch.ones(2, 1024) - return x + def __init__(self): + super(LargeModel, self).__init__() + dim = 5 + n = 40 * 4 * 10 ** 6 + self.emb = torch.nn.Embedding(n, dim) + self.lin1 = torch.nn.Linear(dim, 1) + self.seq = torch.nn.Sequential( + self.emb, + self.lin1, + ) - x = torch.randn(2, 1) - self.run_model_test_with_external_data(torch.jit.script(LargeModel()), x) + def forward(self, input): + return self.seq(input) + + model = LargeModel() + x = torch.tensor([2], dtype=torch.long) + self.run_model_test_with_external_data(LargeModel(), x, use_external_data_format=None) + + @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. + def test_largemodel_with_use_external_data_format_False(self): + class LargeModel(torch.nn.Module): + def __init__(self): + super(LargeModel, self).__init__() + dim = 5 + n = 30 * 4 * 10 ** 6 + self.emb = torch.nn.Embedding(n, dim) + self.lin1 = torch.nn.Linear(dim, 1) + self.seq = torch.nn.Sequential( + self.emb, + self.lin1, + ) + + def forward(self, input): + return self.seq(input) + + model = LargeModel() + x = torch.tensor([3], dtype=torch.long) + + with self.assertRaises(RuntimeError) as cm: + self.run_model_test_with_external_data(LargeModel(), x, use_external_data_format=False) + + the_exception = cm.exception + self.assertEqual("RuntimeError: Exporting model exceed maximum protobuf size of 2GB. 
" + + "Please call torch.onnx.export without setting use_external_data_format parameter.") def test_fuse_conv_bn1d(self): class Fuse(torch.nn.Module): @@ -7784,7 +7818,6 @@ def forward(self, x): script_model = torch.jit.script(model) output = model(x) ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - example_outputs=output, training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) assert not torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) @@ -7828,7 +7861,6 @@ def forward(self, x): y = model(input) output = y.cpu().numpy() ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - example_outputs=y, training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) ort_mask = np.where(ort_outs[0] != 0, 1, 0) @@ -7913,11 +7945,9 @@ def forward(self, box_regression: torch.Tensor, proposals: List[torch.Tensor]): model = torch.jit.script(MyModule()) box_regression = torch.randn([4, 4]) proposal = [torch.randn(2, 4), torch.randn(2, 4)] - outputs = model(box_regression, proposal) with self.assertRaises(RuntimeError) as cm: - convert_to_onnx(model, input=(box_regression, proposal), - example_outputs=outputs) + convert_to_onnx(model, input=(box_regression, proposal)) def test_initializer_sequence(self): class MyModule(torch.nn.Module): @@ -7939,7 +7969,7 @@ def forward(self, x): x = torch.randn(32, 3) f = io.BytesIO() - torch.onnx._export(test_model, (x,), f, _retain_param_name=True, do_constant_folding=False) + torch.onnx._export(test_model, (x,), f, do_constant_folding=False) loaded_model = onnx.load_from_string(f.getvalue()) actual_list = [p.name for p in loaded_model.graph.initializer] @@ -7986,10 +8016,9 @@ def forward(self, x, y): x = torch.ones(2, 3, dtype=torch.float) y = torch.tensor(5, dtype=torch.long) - example_output = (test_model(x, y),) f = io.BytesIO() - torch.onnx.export(test_model, (x, y), f, example_outputs=example_output, _retain_param_name=True, do_constant_folding=False) + torch.onnx.export(test_model, (x, y), f, do_constant_folding=False) loaded_model = onnx.load_from_string(f.getvalue()) actual_list = [p.name for p in loaded_model.graph.initializer] diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index b87fa06d648a44..191cd1c3cb30b5 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -32,7 +32,6 @@ def setUp(self): def _model_to_graph(self, model, input, do_constant_folding=True, - example_outputs=None, training=TrainingMode.EVAL, operator_export_type=OperatorExportTypes.ONNX, input_names=None, @@ -49,7 +48,6 @@ def _model_to_graph(self, model, input, _disable_torch_constant_prop=True, operator_export_type=operator_export_type, training=training, - example_outputs=example_outputs, input_names=input_names, dynamic_axes=dynamic_axes) _set_onnx_shape_inference(True) @@ -116,11 +114,11 @@ def forward(self, input_t, n): example_output = model(input_t, n) with self.assertRaises(RuntimeError): - torch.onnx.export(model, - (input_t, n), - "test.onnx", - opset_version=self.opset_version, - example_outputs=[example_output]) + torch.onnx._export(model, + (input_t, n), + "test.onnx", + opset_version=self.opset_version, + example_outputs=[example_output]) def test_constant_fold_transpose(self): class TransposeModule(torch.nn.Module): @@ -558,27 +556,27 @@ def forward(self, x): assert node.kind() != "onnx::Shape" assert len(list(graph.nodes())) == 1 - def test_strip_doc_string(self): + def test_verbose(self): 
class MyModule(torch.nn.Module): def forward(self, input): return torch.exp(input) x = torch.randn(3, 4) - def is_model_stripped(f, strip_doc_string=None): - if strip_doc_string is None: + def is_model_stripped(f, verbose=None): + if verbose is None: torch.onnx.export(MyModule(), x, f, opset_version=self.opset_version) else: - torch.onnx.export(MyModule(), x, f, strip_doc_string=strip_doc_string, + torch.onnx.export(MyModule(), x, f, verbose=verbose, opset_version=self.opset_version) model = onnx.load(io.BytesIO(f.getvalue())) model_strip = copy.copy(model) onnx.helper.strip_doc_string(model_strip) return model == model_strip - # test strip_doc_string=True (default) + # test verbose=False (default) self.assertTrue(is_model_stripped(io.BytesIO())) - # test strip_doc_string=False - self.assertFalse(is_model_stripped(io.BytesIO(), False)) + # test verbose=True + self.assertFalse(is_model_stripped(io.BytesIO(), True)) # NB: remove this test once DataParallel can be correctly handled def test_error_on_data_parallel(self): @@ -720,9 +718,8 @@ def forward(self, x): q_model = torch.quantization.convert(q_model, inplace=False) q_model.eval() - output = q_model(*pt_inputs) - graph, _, __ = self._model_to_graph(q_model, pt_inputs, example_outputs=output, + graph, _, __ = self._model_to_graph(q_model, pt_inputs, operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, input_names=['pt_inputs'], dynamic_axes={'pt_inputs': [0, 1, 2, 3]}) @@ -749,9 +746,8 @@ def forward(self, x): x = torch.tensor([2]) model = PrimModule() - output = model(x) model.eval() - graph, _, __ = self._model_to_graph(model, (x,), example_outputs=output, + graph, _, __ = self._model_to_graph(model, (x,), operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, input_names=['x'], dynamic_axes={'x': [0]}) iter = graph.nodes() @@ -814,10 +810,9 @@ def forward(self, x): model = torch.jit.script(MyModule()) x = torch.randn(10, 3, 128, 128) - example_outputs = model(x) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(model, (x,), do_constant_folding=True, example_outputs=example_outputs, + graph, _, __ = self._model_to_graph(model, (x,), do_constant_folding=True, operator_export_type=OperatorExportTypes.ONNX, training=torch.onnx.TrainingMode.TRAINING, input_names=['x'], dynamic_axes={'x': [0, 1, 2, 3]}) diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index 144c209bd50fd4..df0383b1f354bb 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ b/test/quantization/eager/test_quantize_eager_ptq.py @@ -1233,8 +1233,6 @@ def _test_lower_graph_impl(self, model, data): input_names = ["x"] def export_to_onnx(model, input, input_names): - outputs = model(input) - traced = torch.jit.trace(model, input) buf = io.BytesIO() torch.jit.save(traced, buf) @@ -1242,7 +1240,7 @@ def export_to_onnx(model, input, input_names): model = torch.jit.load(buf) f = io.BytesIO() - torch.onnx.export(model, input, f, input_names=input_names, example_outputs=outputs, + torch.onnx.export(model, input, f, input_names=input_names, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) onnx_model = export_to_onnx(model, data, input_names) diff --git a/test/test_cpp_extensions_aot.py b/test/test_cpp_extensions_aot.py index cf35e6b13265d9..979f23d0698709 100644 --- a/test/test_cpp_extensions_aot.py +++ b/test/test_cpp_extensions_aot.py @@ -80,6 +80,24 @@ def test_cuda_extension(self): # 2 * 
sigmoid(0) = 2 * 0.5 = 1 self.assertEqual(z, torch.ones_like(z)) + @common.skipIfRocm + @unittest.skipIf(not TEST_CUDA, "CUDA not found") + def test_cublas_extension(self): + from torch_test_cpp_extension import cublas_extension + + x = torch.zeros(100, device="cuda", dtype=torch.float32) + z = cublas_extension.noop_cublas_function(x) + self.assertEqual(z, x) + + @common.skipIfRocm + @unittest.skipIf(not TEST_CUDA, "CUDA not found") + def test_cusolver_extension(self): + from torch_test_cpp_extension import cusolver_extension + + x = torch.zeros(100, device="cuda", dtype=torch.float32) + z = cusolver_extension.noop_cusolver_function(x) + self.assertEqual(z, x) + @unittest.skipIf(IS_WINDOWS, "Not available on Windows") def test_no_python_abi_suffix_sets_the_correct_library_name(self): # For this test, run_test.py will call `python setup.py install` in the diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 07628f6559ceef..6802cc8890dd6c 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -1524,6 +1524,28 @@ def test_sampler_reproducibility(self): ): self.assertEqual(list(fn()), list(fn())) + for sampler in ( + RandomSampler(self.dataset, num_samples=5, replacement=True), + RandomSampler(self.dataset, replacement=False), + WeightedRandomSampler(weights, num_samples=5, replacement=True), + WeightedRandomSampler(weights, num_samples=5, replacement=False), + SubsetRandomSampler(range(10)), + ): + torch.manual_seed(0) + l1 = list(sampler) + list(sampler) + + torch.manual_seed(0) + l2 = list(sampler) + list(sampler) + self.assertEqual(l1, l2) + + its = (iter(sampler), iter(sampler)) + ls = ([], []) + for idx in range(len(sampler)): + for i in range(2): + if idx == 0: + torch.manual_seed(0) + ls[i].append(next(its[i])) + self.assertEqual(ls[0], ls[1]) def _test_sampler(self, **kwargs): indices = range(2, 12) # using a regular iterable diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 0da0c0a1dcebd3..c951b0c53e793a 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -1,3 +1,4 @@ +import copy import http.server import itertools import os @@ -414,13 +415,30 @@ def test_demux_mux_datapipe(self): # Test Case: Uneven DataPipes source_numbers = list(range(0, 10)) + [10, 12] - numbers_dp = IDP(source_numbers) + numbers_dp = dp.iter.IterableWrapper(source_numbers) n1, n2 = numbers_dp.demux(2, lambda x: x % 2) self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1)) self.assertEqual([1, 3, 5, 7, 9], list(n2)) n = n1.mux(n2) self.assertEqual(source_numbers, list(n)) + @suppress_warnings # Suppress warning for lambda fn + def test_map_with_col_file_handle_datapipe(self): + temp_dir = self.temp_dir.name + datapipe1 = dp.iter.FileLister(temp_dir, '') + datapipe2 = dp.iter.FileLoader(datapipe1) + + def _helper(datapipe): + dp1 = datapipe.map(lambda x: x.read(), input_col=1) + dp2 = datapipe.map(lambda x: (x[0], x[1].read())) + self.assertEqual(list(dp1), list(dp2)) + + # tuple + _helper(datapipe2) + # list + datapipe3 = datapipe2.map(lambda x: list(x)) + _helper(datapipe3) + class TestDataFramesPipes(TestCase): """ @@ -619,24 +637,12 @@ def __init__(self, input_dp): super().__init__() self.input_dp = input_dp + # Prevent in-place modification def __iter__(self): - for i in self.input_dp: - yield i - - -class IDP(IterDataPipe): - def __init__(self, input_dp): - super().__init__() - self.input_dp = input_dp - self.length = len(input_dp) - - def __iter__(self): - for i in self.input_dp: + input_dp = self.input_dp if isinstance(self.input_dp, IterDataPipe) 
else copy.deepcopy(self.input_dp) + for i in input_dp: yield i - def __len__(self): - return self.length - class MDP(MapDataPipe): def __init__(self, input_dp): @@ -669,19 +675,19 @@ class TestFunctionalIterDataPipe(TestCase): def _test_picklable(self): arr = range(10) picklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ - (dp.iter.Mapper, IDP(arr), (), {}), - (dp.iter.Mapper, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}), - (dp.iter.Collator, IDP(arr), (), {}), - (dp.iter.Collator, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}), - (dp.iter.Filter, IDP(arr), (_fake_filter_fn, (0, ), {'test': True}), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (_fake_fn, (0, ), {'test': True}), {}), + (dp.iter.Collator, dp.iter.IterableWrapper(arr), (), {}), + (dp.iter.Collator, dp.iter.IterableWrapper(arr), (_fake_fn, (0, ), {'test': True}), {}), + (dp.iter.Filter, dp.iter.IterableWrapper(arr), (_fake_filter_fn, (0, ), {'test': True}), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes: p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] unpicklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ - (dp.iter.Mapper, IDP(arr), (lambda x: x, ), {}), - (dp.iter.Collator, IDP(arr), (lambda x: x, ), {}), - (dp.iter.Filter, IDP(arr), (lambda x: x >= 5, ), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (lambda x: x, ), {}), + (dp.iter.Collator, dp.iter.IterableWrapper(arr), (lambda x: x, ), {}), + (dp.iter.Filter, dp.iter.IterableWrapper(arr), (lambda x: x >= 5, ), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in unpicklable_datapipes: with warnings.catch_warnings(record=True) as wa: @@ -692,8 +698,8 @@ def _test_picklable(self): p = pickle.dumps(datapipe) def test_concat_datapipe(self): - input_dp1 = IDP(range(10)) - input_dp2 = IDP(range(5)) + input_dp1 = dp.iter.IterableWrapper(range(10)) + input_dp2 = dp.iter.IterableWrapper(range(5)) with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"): dp.iter.Concater() @@ -718,7 +724,7 @@ def test_concat_datapipe(self): def test_fork_datapipe(self): - input_dp = IDP(range(10)) + input_dp = dp.iter.IterableWrapper(range(10)) with self.assertRaises(ValueError): input_dp.fork(num_instances=0) @@ -836,7 +842,7 @@ def test_fork_datapipe(self): self.assertEqual(len(input_dp), len(dp3)) def test_demux_datapipe(self): - input_dp = IDP(range(10)) + input_dp = dp.iter.IterableWrapper(range(10)) with self.assertRaises(ValueError): input_dp.demux(num_instances=0, classifier_fn=lambda x: 0) @@ -882,8 +888,8 @@ def test_demux_datapipe(self): self.assertEqual(list(range(0, 5)), output2) # Test Case: classifer returns a value outside of [0, num_instance - 1] - dp = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2) - it = iter(dp[0]) + dp0 = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2) + it = iter(dp0[0]) with self.assertRaises(ValueError): next(it) next(it) @@ -960,7 +966,7 @@ def test_demux_datapipe(self): @suppress_warnings # Suppress warning for lambda fn def test_map_datapipe(self): - input_dp = IDP(range(10)) + input_dp = dp.iter.IterableWrapper(range(10)) def fn(item, dtype=torch.float, *, sum=False): data = torch.tensor(item, dtype=dtype) @@ -1005,7 +1011,7 @@ def fn_nn(d0, d1): def _helper(ref_fn, fn, input_col=None, output_col=None): for constr in (list, tuple): - datapipe = IDP([constr((0, 1, 2)), constr((3, 4, 5)), 
constr((6, 7, 8))]) + datapipe = dp.iter.IterableWrapper([constr((0, 1, 2)), constr((3, 4, 5)), constr((6, 7, 8))]) res_dp = datapipe.map(fn, input_col, output_col) ref_dp = datapipe.map(ref_fn) self.assertEqual(list(res_dp), list(ref_dp)) @@ -1072,9 +1078,11 @@ def _dict_update(data, newdata, remove_idx=None): return _data def _helper(ref_fn, fn, input_col=None, output_col=None): - datapipe = IDP([{"x": 0, "y": 1, "z": 2}, - {"x": 3, "y": 4, "z": 5}, - {"x": 6, "y": 7, "z": 8}]) + datapipe = dp.iter.IterableWrapper( + [{"x": 0, "y": 1, "z": 2}, + {"x": 3, "y": 4, "z": 5}, + {"x": 6, "y": 7, "z": 8}] + ) res_dp = datapipe.map(fn, input_col, output_col) ref_dp = datapipe.map(ref_fn) self.assertEqual(list(res_dp), list(ref_dp)) @@ -1117,7 +1125,7 @@ def _helper(ref_fn, fn, input_col=None, output_col=None): # TODO(VitalyFedyunin): If dill installed this test fails def _test_map_datapipe_nested_level(self): - input_dp = IDP([list(range(10)) for _ in range(3)]) + input_dp = dp.iter.IterableWrapper([list(range(10)) for _ in range(3)]) def fn(item, *, dtype=torch.float): return torch.tensor(item, dtype=dtype) @@ -1153,7 +1161,7 @@ def fn(item, *, dtype=torch.float): def test_collate_datapipe(self): arrs = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - input_dp = IDP(arrs) + input_dp = dp.iter.IterableWrapper(arrs) def _collate_fn(batch): return torch.tensor(sum(batch), dtype=torch.float) @@ -1172,7 +1180,7 @@ def _collate_fn(batch): def test_batch_datapipe(self): arrs = list(range(10)) - input_dp = IDP(arrs) + input_dp = dp.iter.IterableWrapper(arrs) with self.assertRaises(AssertionError): input_dp.batch(batch_size=0) @@ -1200,7 +1208,7 @@ def test_batch_datapipe(self): def test_unbatch_datapipe(self): target_length = 6 - prebatch_dp = IDP(range(target_length)) + prebatch_dp = dp.iter.IterableWrapper(range(target_length)) input_dp = prebatch_dp.batch(3) unbatch_dp = input_dp.unbatch() @@ -1208,13 +1216,13 @@ def test_unbatch_datapipe(self): for i, res in zip(prebatch_dp, unbatch_dp): self.assertEqual(i, res) - input_dp = IDP([[0, 1, 2], [3, 4, 5]]) + input_dp = dp.iter.IterableWrapper([[0, 1, 2], [3, 4, 5]]) unbatch_dp = input_dp.unbatch() self.assertEqual(len(list(unbatch_dp)), target_length) for i, res in zip(prebatch_dp, unbatch_dp): self.assertEqual(i, res) - input_dp = IDP([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) + input_dp = dp.iter.IterableWrapper([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) unbatch_dp = input_dp.unbatch() expected_dp = [[0, 1], [2, 3], [4, 5], [6, 7]] @@ -1233,7 +1241,7 @@ def test_unbatch_datapipe(self): for i, res in zip(expected_dp2, unbatch_dp): self.assertEqual(i, res) - input_dp = IDP([[0, 1, 2], [3, 4, 5]]) + input_dp = dp.iter.IterableWrapper([[0, 1, 2], [3, 4, 5]]) with self.assertRaises(ValueError): unbatch_dp = input_dp.unbatch(unbatch_level=-2) for i in unbatch_dp: @@ -1245,7 +1253,7 @@ def test_unbatch_datapipe(self): print(i) def test_bucket_batch_datapipe(self): - input_dp = IDP(range(20)) + input_dp = dp.iter.IterableWrapper(range(20)) with self.assertRaises(AssertionError): dp.iter.BucketBatcher(input_dp, batch_size=0) @@ -1258,7 +1266,7 @@ def _helper(**kwargs): data_len = 100 arrs = list(range(data_len)) random.shuffle(arrs) - input_dp = IDP(arrs) + input_dp = dp.iter.IterableWrapper(arrs) bucket_dp = dp.iter.BucketBatcher(input_dp, **kwargs) self.assertEqual(len(bucket_dp), data_len // 3 if kwargs['drop_last'] else data_len // 3 + 1) @@ -1291,7 +1299,7 @@ def _sort_fn(data): def test_filter_datapipe(self): - input_ds = IDP(range(10)) + input_ds = 
dp.iter.IterableWrapper(range(10)) def _filter_fn(data, val, clip=False): if clip: @@ -1318,7 +1326,7 @@ def _non_bool_fn(data): def test_filter_datapipe_nested_list(self): - input_ds = IDP(range(10)).batch(5) + input_ds = dp.iter.IterableWrapper(range(10)).batch(5) def _filter_fn(data, val): return data >= val @@ -1340,7 +1348,7 @@ def _filter_fn(data, val): filter_dp = input_ds.filter(nesting_level=5, filter_fn=_filter_fn, fn_kwargs={'val': 5}) temp = list(filter_dp) - input_ds = IDP(range(10)).batch(3) + input_ds = dp.iter.IterableWrapper(range(10)).batch(3) filter_dp = input_ds.filter(lambda ls: len(ls) >= 3) expected_dp3: List[List[int]] = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] @@ -1348,21 +1356,21 @@ def _filter_fn(data, val): for data, exp in zip(filter_dp, expected_dp3): self.assertEqual(data, exp) - input_ds = IDP([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [1, 2, 3]]]) + input_ds = dp.iter.IterableWrapper([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [1, 2, 3]]]) filter_dp = input_ds.filter(lambda x: x > 3, nesting_level=-1) expected_dp4 = [[[4, 5]], [[6, 7, 8]]] self.assertEqual(len(list(filter_dp)), len(expected_dp4)) for data2, exp2 in zip(filter_dp, expected_dp4): self.assertEqual(data2, exp2) - input_ds = IDP([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [1, 2, 3]]]) + input_ds = dp.iter.IterableWrapper([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [1, 2, 3]]]) filter_dp = input_ds.filter(lambda x: x > 7, nesting_level=-1) expected_dp5 = [[[8]]] self.assertEqual(len(list(filter_dp)), len(expected_dp5)) for data2, exp2 in zip(filter_dp, expected_dp5): self.assertEqual(data2, exp2) - input_ds = IDP([[[0, 1], [3, 4]], [[6, 7, 8], [1, 2, 3]]]) + input_ds = dp.iter.IterableWrapper([[[0, 1], [3, 4]], [[6, 7, 8], [1, 2, 3]]]) filter_dp = input_ds.filter(lambda ls: len(ls) >= 3, nesting_level=1) expected_dp6 = [[[6, 7, 8], [1, 2, 3]]] self.assertEqual(len(list(filter_dp)), len(expected_dp6)) @@ -1370,7 +1378,7 @@ def _filter_fn(data, val): self.assertEqual(data2, exp2) def test_sampler_datapipe(self): - input_dp = IDP(range(10)) + input_dp = dp.iter.IterableWrapper(range(10)) # Default SequentialSampler sampled_dp = dp.iter.Sampler(input_dp) # type: ignore[var-annotated] self.assertEqual(len(sampled_dp), 10) @@ -1387,7 +1395,7 @@ def test_sampler_datapipe(self): def test_shuffle_datapipe(self): exp = list(range(20)) - input_ds = IDP(exp) + input_ds = dp.iter.IterableWrapper(exp) with self.assertRaises(AssertionError): shuffle_dp = input_ds.shuffle(buffer_size=0) @@ -1413,15 +1421,15 @@ def test_shuffle_datapipe(self): def test_zip_datapipe(self): with self.assertRaises(TypeError): - dp.iter.Zipper(IDP(range(10)), list(range(10))) # type: ignore[arg-type] + dp.iter.Zipper(dp.iter.IterableWrapper(range(10)), list(range(10))) # type: ignore[arg-type] - zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP_NoLen(range(5))) # type: ignore[var-annotated] + zipped_dp = dp.iter.Zipper(dp.iter.IterableWrapper(range(10)), IDP_NoLen(range(5))) # type: ignore[var-annotated] with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(zipped_dp) exp = list((i, i) for i in range(5)) self.assertEqual(list(zipped_dp), exp) - zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP(range(5))) + zipped_dp = dp.iter.Zipper(dp.iter.IterableWrapper(range(10)), dp.iter.IterableWrapper(range(5))) self.assertEqual(len(zipped_dp), 5) self.assertEqual(list(zipped_dp), exp) # Reset @@ -1506,32 +1514,32 @@ def fn(item, dtype=torch.float, *, sum=False): def test_mux_datapipe(self): # Test Case: Elements are yielded one at a time from each 
DataPipe, until they are all exhausted - input_dp1 = IDP(range(4)) - input_dp2 = IDP(range(4, 8)) - input_dp3 = IDP(range(8, 12)) + input_dp1 = dp.iter.IterableWrapper(range(4)) + input_dp2 = dp.iter.IterableWrapper(range(4, 8)) + input_dp3 = dp.iter.IterableWrapper(range(8, 12)) output_dp = input_dp1.mux(input_dp2, input_dp3) expected_output = [0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11] self.assertEqual(len(expected_output), len(output_dp)) self.assertEqual(expected_output, list(output_dp)) # Test Case: Uneven input Data Pipes - input_dp1 = IDP([1, 2, 3, 4]) - input_dp2 = IDP([10]) - input_dp3 = IDP([100, 200, 300]) + input_dp1 = dp.iter.IterableWrapper([1, 2, 3, 4]) + input_dp2 = dp.iter.IterableWrapper([10]) + input_dp3 = dp.iter.IterableWrapper([100, 200, 300]) output_dp = input_dp1.mux(input_dp2, input_dp3) expected_output = [1, 10, 100, 2, 200, 3, 300, 4] self.assertEqual(len(expected_output), len(output_dp)) self.assertEqual(expected_output, list(output_dp)) # Test Case: Empty Data Pipe - input_dp1 = IDP([0, 1, 2, 3]) - input_dp2 = IDP([]) + input_dp1 = dp.iter.IterableWrapper([0, 1, 2, 3]) + input_dp2 = dp.iter.IterableWrapper([]) output_dp = input_dp1.mux(input_dp2) self.assertEqual(len(input_dp1), len(output_dp)) self.assertEqual(list(input_dp1), list(output_dp)) # Test Case: raises TypeError when __len__ is called and an input doesn't have __len__ - input_dp1 = IDP(range(10)) + input_dp1 = dp.iter.IterableWrapper(range(10)) input_dp_no_len = IDP_NoLen(range(10)) output_dp = input_dp1.mux(input_dp_no_len) with self.assertRaises(TypeError): @@ -1665,8 +1673,8 @@ def __iter__(self) -> Iterator[Tuple[int, str]]: self.assertTrue(issubclass(DP1, IterDataPipe)) dp1 = DP1(10) self.assertTrue(DP1.type.issubtype(dp1.type) and dp1.type.issubtype(DP1.type)) - dp2 = DP1(5) - self.assertEqual(dp1.type, dp2.type) + dp1_ = DP1(5) + self.assertEqual(dp1.type, dp1_.type) with self.assertRaisesRegex(TypeError, r"is not a generic class"): class InvalidDP5(DP1[tuple]): # type: ignore[type-arg] @@ -1679,10 +1687,10 @@ def __iter__(self) -> Iterator[T_co]: yield d # type: ignore[misc] self.assertTrue(issubclass(DP2, IterDataPipe)) - dp1 = DP2() # type: ignore[assignment] - self.assertTrue(DP2.type.issubtype(dp1.type) and dp1.type.issubtype(DP2.type)) - dp2 = DP2() # type: ignore[assignment] - self.assertEqual(dp1.type, dp2.type) + dp2 = DP2() # type: ignore[var-annotated] + self.assertTrue(DP2.type.issubtype(dp2.type) and dp2.type.issubtype(DP2.type)) + dp2_ = DP2() # type: ignore[var-annotated] + self.assertEqual(dp2.type, dp2_.type) class DP3(IterDataPipe[Tuple[T_co, str]]): r""" DataPipe without fixed type with __init__ function""" @@ -1695,10 +1703,10 @@ def __iter__(self) -> Iterator[Tuple[T_co, str]]: yield d, str(d) self.assertTrue(issubclass(DP3, IterDataPipe)) - dp1 = DP3(range(10)) # type: ignore[assignment] - self.assertTrue(DP3.type.issubtype(dp1.type) and dp1.type.issubtype(DP3.type)) - dp2 = DP3(5) # type: ignore[assignment] - self.assertEqual(dp1.type, dp2.type) + dp3 = DP3(range(10)) # type: ignore[var-annotated] + self.assertTrue(DP3.type.issubtype(dp3.type) and dp3.type.issubtype(DP3.type)) + dp3_ = DP3(5) # type: ignore[var-annotated] + self.assertEqual(dp3.type, dp3_.type) class DP4(IterDataPipe[tuple]): r""" DataPipe without __iter__ annotation""" @@ -1707,8 +1715,8 @@ def __iter__(self): raise NotImplementedError self.assertTrue(issubclass(DP4, IterDataPipe)) - dp = DP4() - self.assertTrue(dp.type.param == tuple) + dp4 = DP4() + self.assertTrue(dp4.type.param == tuple) class 
DP5(IterDataPipe): r""" DataPipe without type annotation""" @@ -1717,9 +1725,9 @@ def __iter__(self) -> Iterator[str]: raise NotImplementedError self.assertTrue(issubclass(DP5, IterDataPipe)) - dp = DP5() # type: ignore[assignment] + dp5 = DP5() from torch.utils.data._typing import issubtype - self.assertTrue(issubtype(dp.type.param, Any) and issubtype(Any, dp.type.param)) + self.assertTrue(issubtype(dp5.type.param, Any) and issubtype(Any, dp5.type.param)) class DP6(IterDataPipe[int]): r""" DataPipe with plain Iterator""" @@ -1728,13 +1736,13 @@ def __iter__(self) -> Iterator: raise NotImplementedError self.assertTrue(issubclass(DP6, IterDataPipe)) - dp = DP6() # type: ignore[assignment] - self.assertTrue(dp.type.param == int) + dp6 = DP6() + self.assertTrue(dp6.type.param == int) class DP7(IterDataPipe[Awaitable[T_co]]): r""" DataPipe with abstract base class""" - self.assertTrue(issubclass(DP6, IterDataPipe)) + self.assertTrue(issubclass(DP7, IterDataPipe)) self.assertTrue(DP7.type.param == Awaitable[T_co]) class DP8(DP7[str]): @@ -1765,11 +1773,11 @@ def __iter__(self) -> Iterator[int]: # Non-DataPipe input with DataPipe hint datasource = [(1, '1'), (2, '2'), (3, '3')] with self.assertRaisesRegex(TypeError, r"Expected argument 'dp' as a IterDataPipe"): - dp = DP0(datasource) + dp0 = DP0(datasource) - dp = DP0(IDP(range(10))) + dp0 = DP0(dp.iter.IterableWrapper(range(10))) with self.assertRaisesRegex(TypeError, r"Expected type of argument 'dp' as a subtype"): - dp = DP1(dp) + dp1 = DP1(dp0) def test_runtime(self): class DP(IterDataPipe[Tuple[int, T_co]]): @@ -1784,26 +1792,26 @@ def __iter__(self) -> Iterator[Tuple[int, T_co]]: dss = ([(1, '1'), (2, '2')], [(1, 1), (2, '2')]) for ds in dss: - dp = DP(ds) # type: ignore[var-annotated] - self.assertEqual(list(dp), ds) + dp0 = DP(ds) # type: ignore[var-annotated] + self.assertEqual(list(dp0), ds) # Reset __iter__ - self.assertEqual(list(dp), ds) + self.assertEqual(list(dp0), ds) dss = ([(1, 1), ('2', 2)], # type: ignore[assignment, list-item] [[1, '1'], [2, '2']], # type: ignore[list-item] [1, '1', 2, '2']) for ds in dss: - dp = DP(ds) + dp0 = DP(ds) with self.assertRaisesRegex(RuntimeError, r"Expected an instance as subtype"): - list(dp) + list(dp0) with runtime_validation_disabled(): - self.assertEqual(list(dp), ds) + self.assertEqual(list(dp0), ds) with runtime_validation_disabled(): - self.assertEqual(list(dp), ds) + self.assertEqual(list(dp0), ds) with self.assertRaisesRegex(RuntimeError, r"Expected an instance as subtype"): - list(dp) + list(dp0) def test_reinforce(self): T = TypeVar('T', int, str) @@ -1819,26 +1827,26 @@ def __iter__(self) -> Iterator[T]: ds = list(range(10)) # Valid type reinforcement - dp = DP(ds).reinforce_type(int) - self.assertTrue(dp.type, int) - self.assertEqual(list(dp), ds) + dp0 = DP(ds).reinforce_type(int) + self.assertTrue(dp0.type, int) + self.assertEqual(list(dp0), ds) # Invalid type with self.assertRaisesRegex(TypeError, r"'expected_type' must be a type"): - dp = DP(ds).reinforce_type(1) + dp1 = DP(ds).reinforce_type(1) # Type is not subtype with self.assertRaisesRegex(TypeError, r"Expected 'expected_type' as subtype of"): - dp = DP(ds).reinforce_type(float) + dp2 = DP(ds).reinforce_type(float) # Invalid data at runtime - dp = DP(ds).reinforce_type(str) + dp3 = DP(ds).reinforce_type(str) with self.assertRaisesRegex(RuntimeError, r"Expected an instance as subtype"): - list(dp) + list(dp3) # Context Manager to disable the runtime validation with runtime_validation_disabled(): - self.assertEqual(list(d 
for d in dp), ds) + self.assertEqual(list(d for d in dp3), ds) class NumbersDataset(IterDataPipe): @@ -1900,7 +1908,7 @@ def test_simple_sharding(self): self.assertEqual(sorted(all_items), sorted(items)) def test_sharding_length(self): - numbers_dp = IDP(range(13)) + numbers_dp = dp.iter.IterableWrapper(range(13)) sharded_dp0 = numbers_dp.sharding_filter() torch.utils.data.sharding.apply_sharding(sharded_dp0, 3, 0) sharded_dp1 = numbers_dp.sharding_filter() @@ -1912,7 +1920,7 @@ def test_sharding_length(self): self.assertEqual(4, len(sharded_dp1)) self.assertEqual(4, len(sharded_dp2)) - numbers_dp = IDP(range(1)) + numbers_dp = dp.iter.IterableWrapper(range(1)) sharded_dp0 = numbers_dp.sharding_filter() torch.utils.data.sharding.apply_sharding(sharded_dp0, 2, 0) sharded_dp1 = numbers_dp.sharding_filter() @@ -1922,11 +1930,11 @@ def test_sharding_length(self): @skipIfNoDill def test_old_dataloader(self): - dp = self._get_pipeline() - expected = list(dp) + dp0 = self._get_pipeline() + expected = list(dp0) - dp = self._get_pipeline().sharding_filter() - dl = DataLoader(dp, batch_size=1, shuffle=False, num_workers=2, + dp0 = self._get_pipeline().sharding_filter() + dl = DataLoader(dp0, batch_size=1, shuffle=False, num_workers=2, worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) items = [] for i in dl: diff --git a/test/test_nn.py b/test/test_nn.py index 92357d9ce15394..b6dd466faadfde 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9704,12 +9704,6 @@ def test_cosine_similarity(self): self.assertEqual(input1.grad, torch.zeros_like(input1)) self.assertEqual(input2.grad, input1 * 1e8) - # Check error when inputs are not the same shape - input1 = torch.randn(2, 2, 1) - input2 = torch.randn(2, 1, 3) - with self.assertRaises(RuntimeError): - F.cosine_similarity(input1, input2) - # Check type promotion, issue #61454 input = torch.tensor(12.) 
out = F.cosine_similarity(input.to(torch.int8), input, dim=-1) diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index 38af5bb2c78246..20db5c7972b83d 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -449,5 +449,15 @@ def test_nested_enable_python_mode(self) -> None: with enable_python_mode(LoggingTensor): pass + def test_tolist_numpy_with_python_mode(self) -> None: + x = LoggingTensor(torch.tensor([2.0, 3.0])) + with self.assertRaisesRegex(RuntimeError, "is not supported for tensor subclasses."): + x.tolist() + with self.assertRaisesRegex(RuntimeError, "is not supported for tensor subclasses."): + x.numpy() + with self.assertRaises(AssertionError): + self.assertEqual(x, None) + + if __name__ == '__main__': run_tests() diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index f632e95d9c7046..d66a1e5e452161 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -14,7 +14,7 @@ skipCPUIfNoFFT, deviceCountAtLeast, onlyCUDA, OpDTypes, skipIf) from torch.testing._internal.common_methods_invocations import spectral_funcs, SpectralFuncInfo -from setuptools import distutils +from distutils.version import LooseVersion from typing import Optional, List @@ -217,7 +217,7 @@ def get_op_name(op): @ops([op for op in spectral_funcs if not op.ndimensional]) def test_reference_1d(self, device, dtype, op): norm_modes = ((None, "forward", "backward", "ortho") - if distutils.version.LooseVersion(np.__version__) >= '1.20.0' + if LooseVersion(np.__version__) >= '1.20.0' else (None, "ortho")) test_args = [ *product( @@ -370,7 +370,7 @@ def test_fft_half_and_bfloat16_errors(self, device, dtype, op): @ops([op for op in spectral_funcs if op.ndimensional]) def test_reference_nd(self, device, dtype, op): norm_modes = ((None, "forward", "backward", "ortho") - if distutils.version.LooseVersion(np.__version__) >= '1.20.0' + if LooseVersion(np.__version__) >= '1.20.0' else (None, "ortho")) # input_ndim, s, dim @@ -469,7 +469,7 @@ def test_fftn_invalid(self, device, dtype, op): @dtypes(torch.double, torch.complex128) def test_fft2_numpy(self, device, dtype): norm_modes = ((None, "forward", "backward", "ortho") - if distutils.version.LooseVersion(np.__version__) >= '1.20.0' + if LooseVersion(np.__version__) >= '1.20.0' else (None, "ortho")) # input_ndim, s diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index e72b156c715db7..9530fbff8890eb 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -3258,6 +3258,10 @@ def helper(self, device, dtype, ptype, t_transform, std_transform): self.assertEqual(t_transform(r[:, :50]).std(), std_transform(4), atol=0.3, rtol=0) self.assertEqual(t_transform(r[:, 50:]).std(), std_transform(1), atol=0.2, rtol=0) + # test empty mean/std + out = torch.normal(mean=torch.empty((0, 2)), std=torch.empty((0, 1))) + self.assertEqual(out.size(), torch.Size([0, 2])) + r.fill_(42) r = torch.normal(2, 3, (100, 100), dtype=dtype, device=device) self.assertEqual(r.dtype, dtype) diff --git a/test/test_torch.py b/test/test_torch.py index 3c47fd87122dc7..f800a888493581 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5976,8 +5976,6 @@ def test_masked_scatter_bool_tensor(self, device): dst = dst.masked_scatter(mask, src) self.assertEqual(dst, torch.tensor([True, True, True], device=device)) - # refer https://github.com/pytorch/pytorch/issues/60190 - @skipIfRocm @onlyCUDA @largeTensorTest('30GB') def test_masked_scatter_large_tensor(self, device): @@ 
-8399,6 +8397,13 @@ def generate_inputs(num_batches): finally: torch.set_num_threads(num_threads) + def test_conj_neg_tolist(self): + x = torch.randn(2, dtype=torch.cfloat) + y1 = x.conj() + y1_expect = x.conj_physical() + y2 = y1.imag + self.assertEqual(y1, y1_expect.tolist()) + self.assertEqual(y2, y1_expect.imag.tolist()) # TODO: these empy classes are temporarily instantiated for XLA compatibility # once XLA updates their test suite it should be removed diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 6deda4032c7c58..9fee75d2e3719e 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -365,6 +365,16 @@ def test_conj_imag_view(self, device, dtype) -> None: self.assertEqual(v_imag, t_numpy_conj.imag) self.assertTrue(v_imag.is_neg()) + @onlyOnCPUAndCUDA + def test_conj_view_with_shared_memory(self, device) -> None: + a = _make_tensor((4, 5,), torch.cfloat, device) + b = a.conj() + c = a.conj() + + self.assertEqual(torch.add(a, b), a.add_(b)) + self.assertEqual(torch.add(b, c), torch.add(b, c, out=a)) + self.assertEqual(torch.add(b, c), b.add_(c)) + @onlyOnCPUAndCUDA @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) @suppress_warnings diff --git a/test/test_vmap.py b/test/test_vmap.py index 35b28db84f61f0..6f6996b09bb550 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -1,7 +1,8 @@ from torch.testing._internal.common_utils import TestCase, run_tests import torch import torch.nn.functional as F -from torch import Tensor, vmap +from torch import Tensor +from torch._vmap_internals import vmap import functools import itertools import warnings diff --git a/third_party/kineto b/third_party/kineto index dd32f3accff289..57188f45650e63 160000 --- a/third_party/kineto +++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit dd32f3accff289bc017d84f0ec1af01c36aaa6f8 +Subproject commit 57188f45650e63721d59f5a6005294e79d7d9f3a diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 70f7e7a83e1ec2..15a5a70872342c 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -89,6 +89,8 @@ "tools/autograd/templates/python_variable_methods.cpp", ] +includes = [os.path.join(proj_dir, include) for include in includes] + for new_dir in args.extra_include_dir: abs_new_dir = os.path.join(proj_dir, new_dir) if os.path.exists(abs_new_dir): @@ -112,6 +114,8 @@ "torch/include/*", ] +ignores = [os.path.join(proj_dir, ignore) for ignore in ignores] + # Check if the compiler is hip-clang. def is_hip_clang() -> bool: try: diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index dd8f817aba2c25..a29430520d0ff7 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -8,7 +8,7 @@ from subprocess import check_call, check_output, CalledProcessError import sys import sysconfig -from setuptools import distutils # type: ignore[import] +from distutils.version import LooseVersion from typing import IO, Any, Dict, List, Optional, Union, cast from . 
import which @@ -120,10 +120,10 @@ def _get_cmake_command() -> str: return cmake_command cmake3 = which('cmake3') cmake = which('cmake') - if cmake3 is not None and CMake._get_version(cmake3) >= distutils.version.LooseVersion("3.10.0"): + if cmake3 is not None and CMake._get_version(cmake3) >= LooseVersion("3.10.0"): cmake_command = 'cmake3' return cmake_command - elif cmake is not None and CMake._get_version(cmake) >= distutils.version.LooseVersion("3.10.0"): + elif cmake is not None and CMake._get_version(cmake) >= LooseVersion("3.10.0"): return cmake_command else: raise RuntimeError('no cmake or cmake3 with version >= 3.10.0 found') @@ -134,7 +134,7 @@ def _get_version(cmd: str) -> Any: for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): if 'version' in line: - return distutils.version.LooseVersion(line.strip().split(' ')[2]) + return LooseVersion(line.strip().split(' ')[2]) raise RuntimeError('no version found') def run(self, args: List[str], env: Dict[str, str]) -> None: @@ -387,7 +387,7 @@ def build(self, my_env: Dict[str, str]) -> None: # then. Until then, we use "--" to pass parameters to the # underlying build system. build_args += ['--'] - if IS_WINDOWS: + if IS_WINDOWS and not USE_NINJA: # We are likely using msbuild here build_args += ['/p:CL_MPCount={}'.format(max_jobs)] else: diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py index 8c1f7973442f37..7c214919549afc 100755 --- a/tools/stats/print_test_stats.py +++ b/tools/stats/print_test_stats.py @@ -108,7 +108,7 @@ def plural(n: int) -> str: def get_base_commit(sha1: str) -> str: return subprocess.check_output( - ["git", "merge-base", sha1, "origin/master"], + ["git", "merge-base", sha1, "origin/release/1.10"], encoding="ascii", ).strip() diff --git a/tools/test/test_cmake.py b/tools/test/test_cmake.py index a9fae6dc90e026..ecbce07f52d23f 100644 --- a/tools/test/test_cmake.py +++ b/tools/test/test_cmake.py @@ -22,7 +22,11 @@ def test_build_jobs(self, mock_cpu_count: unittest.mock.MagicMock) -> None: # MAX_JOBS, USE_NINJA, IS_WINDOWS, want (( '8', True, False), ['-j', '8']), # noqa: E201,E241 (( None, True, False), None), # noqa: E201,E241 + (( '7', False, False), ['-j', '7']), # noqa: E201,E241 + (( None, False, False), ['-j', '13']), # noqa: E201,E241 + (( '6', True, True), ['-j', '6']), # noqa: E201,E241 (( None, True, True), None), # noqa: E201,E241 + (( '11', False, True), ['/p:CL_MPCount=11']), # noqa: E201,E241 (( None, False, True), ['/p:CL_MPCount=13']), # noqa: E201,E241 ] for (max_jobs, use_ninja, is_windows), want in cases: diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 743a5d442197ae..38baee4799b548 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -9,6 +9,13 @@ if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) endif() +if(BUILD_BINARY) + add_library(aot_compiler SHARED + ${TORCH_SRC_DIR}/csrc/jit/mobile/nnc/aot_compiler.cpp + ) + install(TARGETS aot_compiler DESTINATION lib) +endif() + if(NOT BUILD_PYTHON) return() endif() @@ -430,9 +437,3 @@ if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") # Pybind11 requires explicit linking of the torch_python library target_link_libraries(nnapi_backend torch torch_python) endif() - -if(BUILD_BINARY) - add_library(aot_compiler SHARED - ${TORCH_SRC_DIR}/csrc/jit/mobile/nnc/aot_compiler.cpp - ) -endif() diff --git a/torch/__init__.py b/torch/__init__.py index 18c9b08a8b5368..abd8c1f918af8e 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -756,8 +756,6 @@ def 
compiled_with_cxx11_abi(): # torch.jit.script as a decorator, for instance): from ._lobpcg import lobpcg as lobpcg -from ._vmap_internals import vmap as vmap - # These were previously defined in native_functions.yaml and appeared on the # `torch` namespace, but we moved them to c10 dispatch to facilitate custom # class usage. We add these lines here to preserve backward compatibility. diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 2a1de8e82a774e..7f0451b089dad1 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -145,6 +145,9 @@ struct TORCH_API Node : std::enable_shared_from_this { // probably operate with names. at::NoNamesGuard no_names_guard; + // Keep track of backward pass for rocblas. + at::BackwardPassGuard in_backward; + bool pre_sampled = false; if (at::shouldRunRecordFunction(&pre_sampled)) { // Using RecordFunction to trigger observers in the backward pass diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 5760c485958a9a..17372137f927fe 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -362,19 +362,21 @@ void ConvertGraphToONNXProto( SymbolDimMap& symbol_map, int opset_version) { RawDataExportMap export_map; - std::tie(model_proto, export_map, symbol_map) = export_onnx( - graph, - {}, - opset_version, - {}, - false, - onnx_torch::OperatorExportTypes::ONNX, - true, - true, - {}, - true, - false, - std::string()); + bool val_use_external_data_format; + std::tie(model_proto, export_map, symbol_map, val_use_external_data_format) = + export_onnx( + graph, + {}, + opset_version, + {}, + false, + onnx_torch::OperatorExportTypes::ONNX, + true, + true, + {}, + true, + false, + std::string()); for (int i = 0; i < model_proto->graph().output_size(); ++i) { model_proto->mutable_graph()->mutable_output(i)->clear_type(); } diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index 2c8246daec92b1..bcab4e676d8361 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -263,19 +263,25 @@ void initPythonIRBindings(PyObject* module_) { std::shared_ptr<::ONNX_NAMESPACE::ModelProto> model_proto; RawDataExportMap export_map; SymbolDimMap symbol_map; - std::tie(model_proto, export_map, symbol_map) = export_onnx( - g, - initializers, - onnx_opset_version, - dynamic_axes, - defer_weight_export, - operator_export_type, - strip_doc_string, - keep_initializers_as_inputs, - custom_opsets, - add_node_names, - use_external_data_format, - onnx_file_path); + bool val_use_external_data_format = false; + std::tie( + model_proto, + export_map, + symbol_map, + val_use_external_data_format) = + export_onnx( + g, + initializers, + onnx_opset_version, + dynamic_axes, + defer_weight_export, + operator_export_type, + strip_doc_string, + keep_initializers_as_inputs, + custom_opsets, + add_node_names, + use_external_data_format, + onnx_file_path); std::unordered_map python_serialized_export_map; for (auto& kv : export_map) { @@ -289,7 +295,9 @@ void initPythonIRBindings(PyObject* module_) { } graph = serialize_model_proto_to_string(model_proto); return std::make_tuple( - py::bytes(graph), python_serialized_export_map); + py::bytes(graph), + python_serialized_export_map, + val_use_external_data_format); }, py::arg("initializers"), py::arg("onnx_opset_version") = 0, diff --git a/torch/csrc/jit/serialization/export.cpp 
b/torch/csrc/jit/serialization/export.cpp index 87792092061cfe..6f5aa2f044ab06 100644 --- a/torch/csrc/jit/serialization/export.cpp +++ b/torch/csrc/jit/serialization/export.cpp @@ -231,6 +231,12 @@ class EncoderBase { bool use_external_data_format = false, const std::string& onnx_file_path = std::string()); + unsigned long long int GetGraphProtoSize( + onnx::GraphProto* graph_proto, + const std::shared_ptr& graph, + const std::map& initializers = + std::map()); + virtual void EncodeTensor( onnx::TensorProto* tensor_proto, const at::Tensor& tensor, @@ -581,7 +587,6 @@ void EncoderBase::AddInitializersIntoGraphProto( bool use_external_data_format, const std::string& onnx_file_path) { AT_ASSERT(block->inputs().size() >= initializers.size()); - for (auto input : block->inputs()) { auto name_tensor_pair = initializers.find(input->debugName()); if (name_tensor_pair == initializers.end()) { @@ -598,6 +603,38 @@ void EncoderBase::AddInitializersIntoGraphProto( } } +unsigned long long int EncoderBase::GetGraphProtoSize( + onnx::GraphProto* graph_proto, + const std::shared_ptr& graph, + const std::map& initializers) { + unsigned long long int sizes = 0; + for (auto input : graph->inputs()) { + auto name_tensor_pair = initializers.find(input->debugName()); + if (name_tensor_pair == initializers.end()) { + continue; + } + onnx::GraphProto* graph_proto_copy = new onnx::GraphProto(*graph_proto); + auto tensor_proto = graph_proto_copy->add_initializer(); + const at::Tensor tensor = name_tensor_pair->second; + for (auto d : tensor.sizes()) { + tensor_proto->add_dims(d); + } + tensor_proto->set_data_type(ATenTypeToOnnxType(tensor.scalar_type())); + at::Tensor t; + if (tensor.is_quantized()) { + t = tensor.contiguous(); + } else { + t = tensor.contiguous().cpu(); + } + tensor_proto->set_raw_data(std::string( + static_cast(t.data_ptr()), t.element_size() * t.numel())); + sizes += tensor_proto->ByteSizeLong(); + delete graph_proto_copy; + graph_proto_copy = nullptr; + } + return sizes; +} + void EncoderBase::AddAttribute( onnx::NodeProto* node_proto, const jit::Node* node, @@ -726,6 +763,10 @@ class GraphEncoder : public EncoderBase { return raw_data_export_map_; } + bool get_use_external_data_format() { + return use_external_data_format_; + } + private: void EncodeTensor( onnx::TensorProto* tensor_proto, @@ -736,6 +777,7 @@ class GraphEncoder : public EncoderBase { RawDataExportMap raw_data_export_map_; bool defer_weight_export_; + bool use_external_data_format_; }; GraphEncoder::GraphEncoder( @@ -754,8 +796,21 @@ GraphEncoder::GraphEncoder( bool use_external_data_format, const std::string& onnx_file_path) : EncoderBase(operator_export_type, strip_doc), - defer_weight_export_(defer_weight_export) { + defer_weight_export_(defer_weight_export), + use_external_data_format_(use_external_data_format) { validateGraph(graph, operator_export_type); + // If graph proto size exceed maximum protobuf size of 2GB, set + // use_external_data_format to true. + if (!use_external_data_format && !onnx_file_path.empty() && + GetGraphProtoSize(model_proto_.mutable_graph(), graph, initializers) > + INT_MAX) { + GRAPH_DEBUG( + "Exporting model exceed maximum protobuf size of 2GB. Storing model parameters in external data files"); + use_external_data_format = true; + // use_external_data_format_ is one of graph_encoder private variable set + // for return `use_external_data_format` value. 
+ use_external_data_format_ = use_external_data_format; + } if (use_external_data_format) { TORCH_CHECK( @@ -895,7 +950,8 @@ std::string pretty_print_onnx( std::tuple< std::shared_ptr<::ONNX_NAMESPACE::ModelProto>, RawDataExportMap, - SymbolDimMap> + SymbolDimMap, + bool> export_onnx( const std::shared_ptr& graph, const std::map& initializers, @@ -929,7 +985,8 @@ export_onnx( std::make_shared<::ONNX_NAMESPACE::ModelProto>( graph_encoder.get_model_proto()), graph_encoder.get_raw_data_export_map(), - graph_encoder.get_symbol_dim_param_map()); + graph_encoder.get_symbol_dim_param_map(), + graph_encoder.get_use_external_data_format()); } std::string serialize_model_proto_to_string( @@ -938,7 +995,7 @@ std::string serialize_model_proto_to_string( TORCH_CHECK( proto_size <= INT_MAX, "Exporting model exceed maximum protobuf size of 2GB. " - "Please call torch.onnx.export with use_external_data_format=True."); + "Please call torch.onnx.export without setting use_external_data_format parameter."); return model_proto->SerializeAsString(); } diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h index 80f0884224392e..37d3433fa5f908 100644 --- a/torch/csrc/jit/serialization/export.h +++ b/torch/csrc/jit/serialization/export.h @@ -33,7 +33,8 @@ using SymbolDimMap = std::map; TORCH_API std::tuple< std::shared_ptr<::ONNX_NAMESPACE::ModelProto>, RawDataExportMap, - SymbolDimMap> + SymbolDimMap, + bool> export_onnx( const std::shared_ptr& graph, const std::map& initializers, diff --git a/torch/csrc/utils/tensor_list.cpp b/torch/csrc/utils/tensor_list.cpp index 7948734f1e58a2..1cde3a196d6bbb 100644 --- a/torch/csrc/utils/tensor_list.cpp +++ b/torch/csrc/utils/tensor_list.cpp @@ -30,7 +30,8 @@ static PyObject* recursive_to_list( } PyObject* tensor_to_list(const Tensor& tensor) { - Tensor data = tensor; + TORCH_CHECK(!tensor.unsafeGetTensorImpl()->is_python_dispatch(), ".tolist() is not supported for tensor subclasses."); + Tensor data = tensor.resolve_conj().resolve_neg(); if (!data.device().is_cpu()) { pybind11::gil_scoped_release no_gil; data = data.toBackend(Backend::CPU); diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 25e9a5962614fb..959e0d1d30e562 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -194,7 +194,7 @@ void recursive_store(char* data, IntArrayRef sizes, IntArrayRef strides, int64_t PyObject** items = PySequence_Fast_ITEMS(seq.get()); for(const auto i : c10::irange(n)) { #ifdef USE_NUMPY - if (PyArray_Check(items[i])) { + if (is_numpy_available() && PyArray_Check(items[i])) { TORCH_WARN_ONCE( "Creating a tensor from a list of numpy.ndarrays is extremely slow. " "Please consider converting the list to a single numpy.ndarray with " diff --git a/torch/csrc/utils/tensor_numpy.cpp b/torch/csrc/utils/tensor_numpy.cpp index 433d1e2e680841..e507ffbf448bb3 100644 --- a/torch/csrc/utils/tensor_numpy.cpp +++ b/torch/csrc/utils/tensor_numpy.cpp @@ -130,6 +130,8 @@ PyObject* tensor_to_numpy(const at::Tensor& tensor) { "Can't call numpy() on Tensor that has negative bit set. 
" "Use tensor.resolve_neg().numpy() instead."); + TORCH_CHECK(!tensor.unsafeGetTensorImpl()->is_python_dispatch(), ".numpy() is not supported for tensor subclasses."); + auto dtype = aten_to_numpy_dtype(tensor.scalar_type()); auto sizes = to_numpy_shape(tensor.sizes()); auto strides = to_numpy_shape(tensor.strides()); diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index e0c2d8907e2b69..a85cc9d224b9e3 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1519,8 +1519,11 @@ def _object_to_tensor(obj): f = io.BytesIO() _pickler(f).dump(obj) byte_storage = torch.ByteStorage.from_buffer(f.getvalue()) # type: ignore[attr-defined] - byte_tensor = torch.tensor(byte_storage, dtype=torch.uint8) - local_size = torch.tensor([byte_tensor.numel()], dtype=torch.long) + # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor and specifying dtype. + # Otherwise, it will casue 100X slowdown. + # See: https://github.com/pytorch/pytorch/issues/65696 + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) return byte_tensor, local_size diff --git a/torch/distributed/elastic/agent/server/local_elastic_agent.py b/torch/distributed/elastic/agent/server/local_elastic_agent.py index 4a8238f1259e85..c84df1a8e43426 100644 --- a/torch/distributed/elastic/agent/server/local_elastic_agent.py +++ b/torch/distributed/elastic/agent/server/local_elastic_agent.py @@ -21,10 +21,11 @@ WorkerState, ) from torch.distributed.elastic.metrics.api import prof -from torch.distributed.elastic.multiprocessing import start_processes, PContext +from torch.distributed.elastic.multiprocessing import PContext, start_processes from torch.distributed.elastic.utils import macros from torch.distributed.elastic.utils.logging import get_logger + log = get_logger() diff --git a/torch/distributed/elastic/multiprocessing/errors/__init__.py b/torch/distributed/elastic/multiprocessing/errors/__init__.py index c1be1e093484f2..be955d28d4704b 100644 --- a/torch/distributed/elastic/multiprocessing/errors/__init__.py +++ b/torch/distributed/elastic/multiprocessing/errors/__init__.py @@ -51,6 +51,7 @@ import json import os import signal +import socket import time import warnings from dataclasses import dataclass, field @@ -109,7 +110,7 @@ def __post_init__(self): try: with open(self.error_file, "r") as fp: self.error_file_data = json.load(fp) - log.info( + log.debug( f"User process failed with error data: {json.dumps(self.error_file_data, indent=2)}" ) self.message, self.timestamp = self._get_error_data( @@ -130,7 +131,7 @@ def __post_init__(self): f" received by PID {self.pid}" ) else: - self.message = f"Process failed with exitcode {self.exitcode}" + self.message = "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html" def _get_error_data(self, error_file_data: Dict[str, Any]) -> Tuple[str, int]: message = error_file_data["message"] @@ -162,24 +163,24 @@ def timestamp_isoformat(self): GlobalRank = int _FAILURE_FORMAT_TEMPLATE = """[${idx}]: - time: ${time} - rank: ${rank} (local_rank: ${local_rank}) - exitcode: ${exitcode} (pid: ${pid}) + time : ${time} + host : ${hostname} + rank : ${rank} (local_rank: ${local_rank}) + exitcode : ${exitcode} (pid: ${pid}) error_file: ${error_file} - msg: ${message}""" + traceback : ${message}""" # extra new lines before and after are intentional _MSG_FORMAT_TEMPLATE = """ ${boarder} ${title} ${section} -Root Cause: -${root_failure} -${section} 
-Other Failures: +Failures: ${other_failures} -${boarder} -""" +${section} +Root Cause (first observed failure): +${root_failure} +${boarder}""" class ChildFailedError(Exception): @@ -230,8 +231,8 @@ def get_first_failure(self) -> Tuple[GlobalRank, ProcessFailure]: rank = min(self.failures.keys(), key=lambda r: self.failures[r].timestamp) return rank, self.failures[rank] - def format_msg(self, boarder_delim="*", section_delim="="): - title = f" {self.name} FAILED " + def format_msg(self, boarder_delim="=", section_delim="-"): + title = f"{self.name} FAILED" root_rank, root_failure = self.get_first_failure() root_failure_fmt: str = "" @@ -246,11 +247,11 @@ def format_msg(self, boarder_delim="*", section_delim="="): other_failures_fmt.append(fmt) # upper boundary on width - width = min(width, 80) + width = min(width, 60) return Template(_MSG_FORMAT_TEMPLATE).substitute( boarder=boarder_delim * width, - title=title.center(width), + title=title, section=section_delim * width, root_failure=root_failure_fmt, other_failures="\n".join(other_failures_fmt or [" "]), @@ -279,6 +280,7 @@ def _format_failure( fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute( idx=idx, time=failure.timestamp_isoformat(), + hostname=socket.getfqdn(), rank=rank, local_rank=failure.local_rank, exitcode=failure.exitcode, @@ -292,32 +294,6 @@ def _format_failure( return fmt, width -def _no_error_file_warning_msg(rank: int, failure: ProcessFailure) -> str: - msg = [ - "CHILD PROCESS FAILED WITH NO ERROR_FILE", - f"Child process {failure.pid} (local_rank {rank}) FAILED (exitcode {failure.exitcode})", - f"Error msg: {failure.message}", - f"Without writing an error file to {failure.error_file}.", - "While this DOES NOT affect the correctness of your application,", - "no trace information about the error will be available for inspection.", - "Consider decorating your top level entrypoint function with", - "torch.distributed.elastic.multiprocessing.errors.record. Example:", - "", - r" from torch.distributed.elastic.multiprocessing.errors import record", - "", - r" @record", - r" def trainer_main(args):", - r" # do train", - ] - width = 0 - for line in msg: - width = max(width, len(line)) - - boarder = "*" * width - header = "CHILD PROCESS FAILED WITH NO ERROR_FILE".center(width) - return "\n".join(["\n", boarder, header, boarder, *msg, boarder]) - - def record( fn: Callable[..., T], error_handler: Optional[ErrorHandler] = None ) -> Callable[..., T]: @@ -372,7 +348,13 @@ def wrapper(*args, **kwargs): if failure.error_file != _NOT_AVAILABLE: error_handler.dump_error_file(failure.error_file, failure.exitcode) else: - warnings.warn(_no_error_file_warning_msg(rank, failure)) + log.info( + ( + f"local_rank {rank} FAILED with no error file." + f" Decorate your entrypoint fn with @record for traceback info." 
+ f" See: https://pytorch.org/docs/stable/elastic/errors.html" + ) + ) raise except Exception as e: error_handler.record_exception(e) diff --git a/torch/distributed/elastic/multiprocessing/errors/error_handler.py b/torch/distributed/elastic/multiprocessing/errors/error_handler.py index 74586e9fd85233..da47eb77e8a55a 100644 --- a/torch/distributed/elastic/multiprocessing/errors/error_handler.py +++ b/torch/distributed/elastic/multiprocessing/errors/error_handler.py @@ -107,7 +107,7 @@ def dump_error_file(self, rootcause_error_file: str, error_code: int = 0): else: rootcause_error["message"]["errorCode"] = error_code - log.info( + log.debug( f"child error file ({rootcause_error_file}) contents:\n" f"{json.dumps(rootcause_error, indent=2)}" ) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index c6e84d6f65f4bc..127d91355a4f47 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -304,6 +304,27 @@ def train(): if should_checkpoint: save_checkpoint(checkpoint_path) + +9. (Recommended) On worker errors, this tool will summarize the details of the error + (e.g. time, rank, host, pid, traceback, etc). On each node, the first error (by timestamp) + is heuristically reported as the "Root Cause" error. To get tracebacks as part of this + error summary print out, you must decorate your main entrypoint function in your + training script as shown in the example below. If not decorated, then the summary + will not include the traceback of the exception and will only contain the exitcode. + For details on torchelastic error handling see: https://pytorch.org/docs/stable/elastic/errors.html + +:: + + from torch.distributed.elastic.multiprocessing.errors import record + + @record + def main(): + # do train + pass + + if __name__ == "__main__": + main() + """ import logging import os @@ -597,7 +618,7 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1: omp_num_threads = 1 log.warning( - f"*****************************************\n" + f"\n*****************************************\n" f"Setting OMP_NUM_THREADS environment variable for each process to be " f"{omp_num_threads} in default, to avoid your system being overloaded, " f"please further tune the variable for optimal performance in " diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 4b0449c8f56727..5f14f31d15a46c 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -4256,7 +4256,10 @@ def pairwise_distance(x1: Tensor, x2: Tensor, p: float = 2.0, eps: float = 1e-6, r""" cosine_similarity(x1, x2, dim=1, eps=1e-8) -> Tensor -Returns cosine similarity between x1 and x2, computed along dim. +Returns cosine similarity between ``x1`` and ``x2``, computed along dim. ``x1`` and ``x2`` must be broadcastable +to a common shape. ``dim`` refers to the dimension in this common shape. Dimension ``dim`` of the output is +squeezed (see :func:`torch.squeeze`), resulting in the +output tensor having 1 fewer dimension. .. math :: \text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2 \cdot \Vert x_2 \Vert _2, \epsilon)} @@ -4265,16 +4268,11 @@ def pairwise_distance(x1: Tensor, x2: Tensor, p: float = 2.0, eps: float = 1e-6, Args: x1 (Tensor): First input. - x2 (Tensor): Second input (with the same number of dimensions as x1, matching x1 size at dimension `dim`, - and broadcastable with x1 at other dimensions). - dim (int, optional): Dimension of vectors. Default: 1 + x2 (Tensor): Second input. 
+ dim (int, optional): Dimension along which cosine similarity is computed. Default: 1 eps (float, optional): Small value to avoid division by zero. Default: 1e-8 -Shape: - - Input: :math:`(\ast_1, D, \ast_2)` where D is at position `dim`. - - Output: :math:`(\ast_1, \ast_2)` - Example:: >>> input1 = torch.randn(100, 128) diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index 2494c2d6dff854..3b641f85709db0 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -31,10 +31,10 @@ def _export(*args, **kwargs): def export(model, args, f, export_params=True, verbose=False, training=TrainingMode.EVAL, input_names=None, output_names=None, operator_export_type=None, - opset_version=None, _retain_param_name=True, do_constant_folding=True, - example_outputs=None, strip_doc_string=True, dynamic_axes=None, - keep_initializers_as_inputs=None, custom_opsets=None, enable_onnx_checker=True, - use_external_data_format=False): + opset_version=None, _retain_param_name=None, do_constant_folding=True, + example_outputs=None, strip_doc_string=None, dynamic_axes=None, + keep_initializers_as_inputs=None, custom_opsets=None, enable_onnx_checker=None, + use_external_data_format=None): r""" Exports a model into ONNX format. If ``model`` is not a :class:`torch.jit.ScriptModule` nor a :class:`torch.jit.ScriptFunction`, this runs @@ -105,7 +105,9 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM In this case, the exported model will first take all of its parameters as arguments, with the ordering as specified by ``model.state_dict().values()`` verbose (bool, default False): if True, prints a description of the - model being exported to stdout. + model being exported to stdout. In addition, the final ONNX graph will include the + field ``doc_string``` from the exported model which mentions the source code locations + for ``model``. training (enum, default TrainingMode.EVAL): * ``TrainingMode.EVAL``: export the model in inference mode. * ``TrainingMode.PRESERVE``: export the model in inference mode if model.training is @@ -181,16 +183,17 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM opset_version (int, default 9): Must be ``== _onnx_main_opset or in _onnx_stable_opsets``, defined in torch/onnx/symbolic_helper.py. + _retain_param_name (bool, default True): [Deprecated and ignored. Will be removed in next PyTorch + release] do_constant_folding (bool, default False): Apply the constant-folding optimization. Constant-folding will replace some of the ops that have all constant inputs with pre-computed constant nodes. example_outputs (T or a tuple of T, where T is Tensor or convertible to Tensor, default None): + [Deprecated and ignored. Will be removed in next PyTorch release], Must be provided when exporting a ScriptModule or ScriptFunction, ignored otherwise. Used to determine the type and shape of the outputs without tracing the execution of the model. A single object is treated as equivalent to a tuple of one element. - strip_doc_string (bool, default True): do not include the field - ``doc_string``` from the exported model. Otherwise the field will mention the source - code locations for ``model``. + strip_doc_string (bool, default True): [Deprecated and ignored. 
Will be removed in next PyTorch release] dynamic_axes (dict> or dict, default empty dict): By default the exported model will have the shapes of all input and output tensors @@ -293,10 +296,11 @@ def forward(self, x): enable_onnx_checker (bool, default True): Deprecated and ignored. Will be removed in next Pytorch release. - use_external_data_format (bool, default False): If True, then some of the model - parameters are stored in external data files and not in the ONNX model file itself. - Models larger than 2GB cannot be exported in one file because of size limits imposed - by Protocol Buffers. + use_external_data_format (bool, default False): [Deprecated and ignored. Will be removed in + next Pytorch release.] + If True, then some of the model parameters are stored in external data files and not in + the ONNX model file itself. Models larger than 2GB cannot be exported in one file because + of size limits imposed by Protocol Buffers. For details see `onnx.proto `_. If True, argument ``f`` must be a string specifying the location of the model. @@ -316,9 +320,24 @@ def forward(self, x): custom_opsets, enable_onnx_checker, use_external_data_format) -def export_to_pretty_string(*args, **kwargs): +def export_to_pretty_string(*args, **kwargs) -> str: r""" - Same as :func:`export`, but returns a text representation of the exported model. + Similar to :func:`export`, but returns a text representation of the ONNX + model. Only differences in args listed below. All other args are the same + as :func:`export`. + + Args: + f: Deprecated and ignored. Will be removed in the next release of + PyTorch. + add_node_names (bool, default True): Whether or not to set + NodeProto.name. This makes no difference unless + ``google_printer=True``. + google_printer (bool, default False): If False, will return a custom, + compact representation of the model. If True will return the + protobuf's `Message::DebugString()`, which is more verbose. + + Returns: + A UTF-8 str containing a human-readable representation of the ONNX model. """ from torch.onnx import utils return utils.export_to_pretty_string(*args, **kwargs) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 17b5953a2db533..b0418f7cd547b3 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -12,6 +12,7 @@ import re import collections import contextlib +import copy import numbers import warnings from torch._six import string_classes @@ -75,24 +76,37 @@ def select_model_mode_for_export(model, mode): def export(model, args, f, export_params=True, verbose=False, training=None, input_names=None, output_names=None, operator_export_type=None, - opset_version=None, _retain_param_name=True, do_constant_folding=True, - example_outputs=None, strip_doc_string=True, dynamic_axes=None, + opset_version=None, _retain_param_name=None, do_constant_folding=True, + example_outputs=None, strip_doc_string=None, dynamic_axes=None, keep_initializers_as_inputs=None, custom_opsets=None, - enable_onnx_checker=None, use_external_data_format=False): + enable_onnx_checker=None, use_external_data_format=None): if operator_export_type is None: if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE: operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK else: operator_export_type = OperatorExportTypes.ONNX + if enable_onnx_checker is not None: - warnings.warn("`enable_onnx_checker' is deprecated and ignored. It will be removed in" - "the next PyTorch release. 
To proceed despite ONNX checker failures, you"
-                      "can catch torch.onnx.ONNXCheckerError.")
+        warnings.warn("'enable_onnx_checker' is deprecated and ignored. It will be removed in "
+                      "the next PyTorch release. To proceed despite ONNX checker failures, "
+                      "catch torch.onnx.ONNXCheckerError.")
+    if _retain_param_name is not None:
+        warnings.warn("'_retain_param_name' is deprecated and ignored. "
+                      "It will be removed in the next PyTorch release.")
+    if strip_doc_string is not None:
+        warnings.warn("'strip_doc_string' is deprecated and ignored. It will be removed in "
+                      "the next PyTorch release. Its effect is now controlled by the 'verbose' argument.")
+    if example_outputs is not None:
+        warnings.warn("'example_outputs' is deprecated and ignored. "
+                      "It will be removed in the next PyTorch release.")
+    if use_external_data_format is not None:
+        warnings.warn("'use_external_data_format' is deprecated and ignored. It will be removed in "
+                      "the next PyTorch release. The exporter now chooses this automatically: models "
+                      "within the 2GB Protocol Buffers size limit are written as a single file, and "
+                      "larger models use external data files.")
     _export(model, args, f, export_params, verbose, training, input_names, output_names,
             operator_export_type=operator_export_type, opset_version=opset_version,
-            _retain_param_name=_retain_param_name, do_constant_folding=do_constant_folding,
-            example_outputs=example_outputs, strip_doc_string=strip_doc_string,
+            do_constant_folding=do_constant_folding, example_outputs=example_outputs,
             dynamic_axes=dynamic_axes, keep_initializers_as_inputs=keep_initializers_as_inputs,
             custom_opsets=custom_opsets, use_external_data_format=use_external_data_format)
@@ -310,7 +324,10 @@ def _decide_external_data_format(use_external_data_format, operator_export_type,
     # string specifying the location of the model. For large model cases, if f is not a non-empty string,
     # then this method returns an empty string, which is an error condition for the large model export code
     # path later (but not for regular model export code path).
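Taken together, the deprecations above mean a typical export call simply drops those arguments; passing them still works but only emits the warnings shown above. A minimal sketch, with a hypothetical `TwoLayer` module standing in for a real model:

```python
import torch

class TwoLayer(torch.nn.Module):  # hypothetical toy model
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.fc(x).relu()

# The deprecated keywords (_retain_param_name, strip_doc_string, example_outputs,
# enable_onnx_checker, use_external_data_format) are simply omitted;
# verbose=True now also keeps the doc_string fields in the exported graph.
torch.onnx.export(TwoLayer(), (torch.randn(1, 4),), "two_layer.onnx",
                  opset_version=13, verbose=True)
```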
- model_file_location = f if val_use_external_data_format and isinstance(f, str) else str() + if (val_use_external_data_format is None or val_use_external_data_format is True) and isinstance(f, str): + model_file_location = f + else: + model_file_location = str() return val_use_external_data_format, model_file_location def _decide_input_format(model, args): @@ -389,7 +406,7 @@ def _get_param_count_list(method_graph, args_params): return param_count_list -def _create_jit_graph(model, args, _retain_param_name): +def _create_jit_graph(model, args): torch_out = None params: Union[List, Tuple] if isinstance(model, torch.jit.ScriptModule): @@ -420,13 +437,12 @@ def _create_jit_graph(model, args, _retain_param_name): graph, torch_out = _trace_and_get_graph_from_model(model, args) state_dict = _unique_state_dict(model) params = list(state_dict.values()) - if _retain_param_name: - graph_inputs = list(graph.inputs()) - user_input_num = len(graph_inputs) - len(state_dict) - param_names = list(state_dict.keys()) - for i, inp in enumerate(graph_inputs): - if i >= user_input_num: - inp.setDebugName(param_names[i - user_input_num]) + graph_inputs = list(graph.inputs()) + user_input_num = len(graph_inputs) - len(state_dict) + param_names = list(state_dict.keys()) + for i, inp in enumerate(graph_inputs): + if i >= user_input_num: + inp.setDebugName(param_names[i - user_input_num]) torch._C._jit_pass_onnx_function_substitution(graph) return graph, params, torch_out, None @@ -437,12 +453,25 @@ def _get_named_param_dict(graph, params): _params_dict = dict(zip(param_names, params)) return _params_dict +def _get_example_outputs(model, args): + input_args = copy.deepcopy(args) + input_kwargs = {} + if input_args and isinstance(input_args[-1], dict): + input_kwargs = input_args[-1] + input_args = input_args[:-1] + + example_outputs = model(*input_args, **input_kwargs) + if isinstance(example_outputs, (torch.Tensor, int, float, bool)): + example_outputs = (example_outputs,) + + if isinstance(example_outputs, list): + example_outputs = [example_outputs] + return example_outputs def _model_to_graph(model, args, verbose=False, input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX, - example_outputs=None, - _retain_param_name=False, do_constant_folding=True, + example_outputs=None, do_constant_folding=True, _disable_torch_constant_prop=False, fixed_batch_size=False, training=None, dynamic_axes=None): r"""Converts model into an ONNX graph. @@ -461,11 +490,7 @@ def _model_to_graph(model, args, verbose=False, if isinstance(args, (torch.Tensor, int, float, bool)): args = (args, ) - if isinstance(example_outputs, (torch.Tensor, int, float, bool)): - example_outputs = (example_outputs,) - - graph, params, torch_out, module = _create_jit_graph(model, args, - _retain_param_name) + graph, params, torch_out, module = _create_jit_graph(model, args) params_dict = _get_named_param_dict(graph, params) @@ -476,13 +501,22 @@ def _model_to_graph(model, args, verbose=False, module=module) from torch.onnx.symbolic_helper import _onnx_shape_inference if isinstance(model, torch.jit.ScriptModule) or isinstance(model, torch.jit.ScriptFunction): - assert example_outputs is not None, "example_outputs must be provided when exporting a ScriptModule or " \ - "ScriptFunction." 
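With the `_get_example_outputs` fallback above, scripted inputs no longer need `example_outputs` to be passed explicitly. A sketch, assuming a trivial scripted function:

```python
import torch

@torch.jit.script
def scaled_relu(x: torch.Tensor) -> torch.Tensor:
    return torch.relu(x) * 2.0

# The exporter runs the scripted function once (via the new _get_example_outputs
# helper) to infer output types and shapes, so example_outputs can be omitted.
torch.onnx.export(scaled_relu, (torch.randn(3),), "scaled_relu.onnx", opset_version=13)
```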
- if isinstance(example_outputs, list): - example_outputs = [example_outputs] + if example_outputs is None: + example_outputs = _get_example_outputs(model, args) + else: + # example_outpus specified + if isinstance(example_outputs, (torch.Tensor, int, float, bool)): + example_outputs = (example_outputs,) + + if isinstance(example_outputs, list): + example_outputs = [example_outputs] out_vars, desc = torch.jit._flatten(tuple(example_outputs)) torch._C._jit_pass_onnx_assign_output_shape(graph, out_vars, desc, _onnx_shape_inference) + else: + flatten_args, _ = torch._C._jit_flatten(args) + # make sure that the param dict and the graph match each other + assert len(params) + len(flatten_args) == sum(1 for _ in graph.inputs()) # NB: ONNX requires complete information about output types, which might be # erased by some optimizations, so we need to set it explicitly again. @@ -533,14 +567,18 @@ def _model_to_graph(model, args, verbose=False, def export_to_pretty_string(model, args, f, export_params=True, verbose=False, training=None, input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX, export_type=ExportTypes.PROTOBUF_FILE, example_outputs=None, - google_printer=False, opset_version=None, _retain_param_name=True, + google_printer=False, opset_version=None, _retain_param_name=None, keep_initializers_as_inputs=None, custom_opsets=None, add_node_names=True, do_constant_folding=True, dynamic_axes=None): + if f is not None: + warnings.warn("'f' is deprecated and ignored. It will be removed in the next PyTorch release.") + if _retain_param_name is not None: + warnings.warn("'_retain_param_name' is deprecated and ignored. " + "It will be removed in the next PyTorch release.") return _export_to_pretty_string(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, example_outputs, google_printer, - opset_version, _retain_param_name, - do_constant_folding=do_constant_folding, + opset_version, do_constant_folding=do_constant_folding, add_node_names=add_node_names, keep_initializers_as_inputs=keep_initializers_as_inputs, custom_opsets=custom_opsets, dynamic_axes=dynamic_axes) @@ -549,7 +587,7 @@ def export_to_pretty_string(model, args, f, export_params=True, verbose=False, t def _export_to_pretty_string(model, args, f, export_params=True, verbose=False, training=None, input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX, export_type=ExportTypes.PROTOBUF_FILE, example_outputs=None, - google_printer=False, opset_version=None, _retain_param_name=False, + google_printer=False, opset_version=None, do_constant_folding=True, keep_initializers_as_inputs=None, fixed_batch_size=False, custom_opsets=None, add_node_names=True, onnx_shape_inference=True, dynamic_axes=None): @@ -572,8 +610,8 @@ def _export_to_pretty_string(model, args, f, export_params=True, verbose=False, args = _decide_input_format(model, args) graph, params_dict, torch_out = _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, - example_outputs, _retain_param_name, - val_do_constant_folding, fixed_batch_size=fixed_batch_size, + example_outputs, val_do_constant_folding, + fixed_batch_size=fixed_batch_size, training=training, dynamic_axes=dynamic_axes) return graph._pretty_print_onnx(params_dict, opset_version, False, @@ -633,10 +671,10 @@ def _find_missing_ops_onnx_export(model, args, f, verbose=False, training=Traini def _export(model, args, f, export_params=True, verbose=False, training=None, 
input_names=None, output_names=None, operator_export_type=None, export_type=ExportTypes.PROTOBUF_FILE, example_outputs=None, - opset_version=None, _retain_param_name=False, do_constant_folding=True, - strip_doc_string=True, dynamic_axes=None, keep_initializers_as_inputs=None, + opset_version=None, do_constant_folding=True, + dynamic_axes=None, keep_initializers_as_inputs=None, fixed_batch_size=False, custom_opsets=None, add_node_names=True, - use_external_data_format=False, onnx_shape_inference=True): + use_external_data_format=None, onnx_shape_inference=True): if isinstance(model, torch.nn.DataParallel): raise ValueError("torch.nn.DataParallel is not supported by ONNX " @@ -685,8 +723,7 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, graph, params_dict, torch_out = \ _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, - example_outputs, _retain_param_name, - val_do_constant_folding, + example_outputs, val_do_constant_folding, fixed_batch_size=fixed_batch_size, training=training, dynamic_axes=dynamic_axes) @@ -697,14 +734,14 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, custom_opsets = {} if export_params: - proto, export_map = graph._export_onnx( + proto, export_map, val_use_external_data_format = graph._export_onnx( params_dict, opset_version, dynamic_axes, defer_weight_export, - operator_export_type, strip_doc_string, val_keep_init_as_ip, custom_opsets, + operator_export_type, not verbose, val_keep_init_as_ip, custom_opsets, val_add_node_names, val_use_external_data_format, model_file_location) else: - proto, export_map = graph._export_onnx( + proto, export_map, val_use_external_data_format = graph._export_onnx( {}, opset_version, dynamic_axes, False, operator_export_type, - strip_doc_string, val_keep_init_as_ip, custom_opsets, val_add_node_names, + not verbose, val_keep_init_as_ip, custom_opsets, val_add_node_names, val_use_external_data_format, model_file_location) if export_type == ExportTypes.PROTOBUF_FILE: diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 5d0849bb8407d7..78af6dcc968806 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -6,7 +6,7 @@ from torch.testing._internal.common_utils import TEST_NUMBA import inspect import contextlib -from setuptools import distutils +from distutils.version import LooseVersion TEST_CUDA = torch.cuda.is_available() @@ -16,7 +16,7 @@ TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.tensor(1., device=CUDA_DEVICE)) TEST_CUDNN_VERSION = torch.backends.cudnn.version() if TEST_CUDNN else 0 -CUDA11OrLater = torch.version.cuda and distutils.version.LooseVersion(torch.version.cuda) >= "11.0" +CUDA11OrLater = torch.version.cuda and LooseVersion(torch.version.cuda) >= "11.0" CUDA9 = torch.version.cuda and torch.version.cuda.startswith('9.') SM53OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3) SM60OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index d817c9e56ac5bf..42582268b55d9e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -37,7 +37,7 @@ GRADCHECK_NONDET_TOL, skipIfTBB) import torch.testing._internal.opinfo_helper as opinfo_helper -from setuptools import 
distutils +from distutils.version import LooseVersion if TEST_SCIPY: import scipy.special @@ -1256,6 +1256,8 @@ def generator(): yield SampleInput(make_arg(input_shape), args=(make_arg(input_shape),), kwargs=kwargs) # Test for Broadcasting yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) + yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -2}) + yield SampleInput(make_arg((2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) return list(generator()) @@ -9353,11 +9355,11 @@ def generate_std_var_kwargs(t: torch.Tensor, **kwargs): skips=( # Reference: https://github.com/pytorch/pytorch/pull/49155#issuecomment-742664611 DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - active_if=TEST_SCIPY and distutils.version.LooseVersion(scipy.__version__) < "1.4.0"), + active_if=TEST_SCIPY and LooseVersion(scipy.__version__) < "1.4.0"), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_hard', - active_if=TEST_SCIPY and distutils.version.LooseVersion(scipy.__version__) < "1.4.0"), + active_if=TEST_SCIPY and LooseVersion(scipy.__version__) < "1.4.0"), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_normal', - active_if=TEST_SCIPY and distutils.version.LooseVersion(scipy.__version__) < "1.4.0"), + active_if=TEST_SCIPY and LooseVersion(scipy.__version__) < "1.4.0"), )), UnaryUfuncInfo('lgamma', ref=reference_lgamma if TEST_SCIPY else _NOTHING, diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index b32c3a159fc0d0..b4ef79620d8d92 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1781,8 +1781,10 @@ def assertEqual(self, x, y, msg: Optional[str] = None, *, assert (atol is None) == (rtol is None), "If one of atol or rtol is specified, then the other must be too" debug_msg: Optional[str] = None + if x is None or y is None: + self.assertTrue(x is None and y is None) # Tensor x Number and Number x Tensor comparisons - if isinstance(x, torch.Tensor) and isinstance(y, Number): + elif isinstance(x, torch.Tensor) and isinstance(y, Number): self.assertEqual(x.item(), y, atol=atol, rtol=rtol, msg=msg, exact_dtype=exact_dtype, exact_device=exact_device) elif isinstance(y, torch.Tensor) and isinstance(x, Number): diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index bb0a85982c665c..a482f5d8e29cbd 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -16,7 +16,7 @@ from .file_baton import FileBaton from ._cpp_extension_versioner import ExtensionVersioner from .hipify import hipify_python -from .hipify.hipify_python import get_hip_file_path, GeneratedFileCleaner +from .hipify.hipify_python import GeneratedFileCleaner from typing import List, Optional, Union from setuptools.command.build_ext import build_ext @@ -939,16 +939,19 @@ def CUDAExtension(name, sources, *args, **kwargs): hipify_result = hipify_python.hipify( project_directory=build_dir, output_directory=build_dir, - includes=[os.path.join(os.path.relpath(include_dir, build_dir), '*') for include_dir in include_dirs] if include_dirs else ['*'], + header_include_dirs=include_dirs, + includes=[os.path.join(build_dir, '*')], # limit scope to build_dir only extra_files=[os.path.abspath(s) for s in sources], show_detailed=True, is_pytorch_extension=True, + hipify_extra_files_only=True, # don't hipify everything in includes path ) 
hipified_sources = set() for source in sources: s_abs = os.path.abspath(source) - hipified_sources.add(hipify_result[s_abs]["hipified_path"] if s_abs in hipify_result else s_abs) + hipified_sources.add(hipify_result[s_abs]["hipified_path"] if (s_abs in hipify_result and + hipify_result[s_abs]["hipified_path"] is not None) else s_abs) sources = list(hipified_sources) @@ -1325,15 +1328,25 @@ def _jit_compile(name, try: with GeneratedFileCleaner(keep_intermediates=keep_intermediates) as clean_ctx: if IS_HIP_EXTENSION and (with_cuda or with_cudnn): - hipify_python.hipify( + hipify_result = hipify_python.hipify( project_directory=build_directory, output_directory=build_directory, - includes=os.path.join(build_directory, '*'), + header_include_dirs=(extra_include_paths if extra_include_paths is not None else []), extra_files=[os.path.abspath(s) for s in sources], + ignores=[os.path.join(ROCM_HOME, '*'), os.path.join(_TORCH_PATH, '*')], # no need to hipify ROCm or PyTorch headers show_detailed=verbose, + show_progress=verbose, is_pytorch_extension=True, clean_ctx=clean_ctx ) + + hipified_sources = set() + for source in sources: + s_abs = os.path.abspath(source) + hipified_sources.add(hipify_result[s_abs]["hipified_path"] if s_abs in hipify_result else s_abs) + + sources = list(hipified_sources) + _write_ninja_file_and_build_library( name=name, sources=sources, @@ -1636,7 +1649,7 @@ def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: # Use same defaults from file cmake/public/LoadHIP.cmake. # Must keep in sync if defaults change. # Allow env var to override, just like during initial cmake build. - archs = os.environ.get('PYTORCH_ROCM_ARCH', 'gfx803;gfx900;gfx906;gfx908') + archs = os.environ.get('PYTORCH_ROCM_ARCH', 'gfx900;gfx906;gfx908;gfx90a') flags = ['--amdgpu-target=%s' % arch for arch in archs.split(';')] flags += ['-fno-gpu-rdc'] return flags @@ -1826,10 +1839,6 @@ def _write_ninja_file_to_build_library(path, cuda_flags = ['-DWITH_HIP'] + cflags + COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS cuda_flags += extra_cuda_cflags cuda_flags += _get_rocm_arch_flags(cuda_flags) - sources = [s if not _is_cuda_file(s) else - os.path.abspath(os.path.join( - path, get_hip_file_path(os.path.relpath(s, path), is_pytorch_extension=True))) - for s in sources] elif with_cuda: cuda_flags = common_cflags + COMMON_NVCC_FLAGS + _get_cuda_arch_flags() if IS_WINDOWS: @@ -1940,6 +1949,8 @@ def sanitize_flags(flags): nvcc = _join_cuda_home('bin', 'nvcc') config.append(f'nvcc = {nvcc}') + if IS_HIP_EXTENSION: + post_cflags = COMMON_HIP_FLAGS + post_cflags flags = [f'cflags = {" ".join(cflags)}'] flags.append(f'post_cflags = {" ".join(post_cflags)}') if with_cuda: diff --git a/torch/utils/data/datapipes/README.md b/torch/utils/data/datapipes/README.md index 69cd56d3cfbd1b..e19e1efdc6e2cb 100644 --- a/torch/utils/data/datapipes/README.md +++ b/torch/utils/data/datapipes/README.md @@ -21,7 +21,9 @@ class MapperIterDataPipe(IterDataPipe): self.dp = dp self.fn = fn ``` -Note: Avoid loading data from the source DataPipe in `__init__` function, in order to support lazy data loading and save memory. +Note: +- Avoid loading data from the source DataPipe in `__init__` function, in order to support lazy data loading and save memory. +- If `IterDataPipe` instance holds data in memory, please be ware of the in-place modification of data. When second iterator is created from the instance, the data may have already changed. 
Please take [`IterableWrapper`](https://github.com/pytorch/pytorch/blob/master/torch/utils/data/datapipes/iter/utils.py) class as reference to `deepcopy` data for each iterator. ### Iterator For `IterDataPipe`, an `__iter__` function is needed to consume data from the source `IterDataPipe` then apply operation over the data before yield. diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py index e7f81842b233ef..bd1134c23e1f97 100644 --- a/torch/utils/data/datapipes/iter/callable.py +++ b/torch/utils/data/datapipes/iter/callable.py @@ -1,4 +1,3 @@ -import copy import warnings from torch.utils.data import IterDataPipe, _utils, functional_datapipe, DataChunk from typing import Callable, Dict, Iterator, Optional, Sized, Tuple, TypeVar @@ -99,8 +98,6 @@ def _apply_fn(self, data): data = list(data) else: t_flag = False - # Deepcopy data to prevent the original data modified. E.g. list, dict - data = copy.deepcopy(data) if self.output_col is None: if isinstance(self.input_col, (list, tuple)): diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index f8486811adeeb2..37dcc6238be2cb 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -1,9 +1,9 @@ import random -import warnings from collections import defaultdict from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk +from torch.utils.data.datapipes.utils.common import deprecation_warning_torchdata from typing import Any, Callable, DefaultDict, Iterator, List, Optional, Sized, TypeVar T_co = TypeVar('T_co', covariant=True) @@ -185,8 +185,7 @@ def __init__(self, assert batch_size > 0, "Batch size is required to be larger than 0!" assert batch_num > 0, "Number of batches is required to be larger than 0!" assert bucket_num > 0, "Number of buckets is required to be larger than 0!" 
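A minimal sketch of the lazy pattern the README note above recommends, with a toy `MapperIterDataPipe` that defers all reads to `__iter__`:

```python
from torch.utils.data import IterDataPipe

class MapperIterDataPipe(IterDataPipe):
    def __init__(self, dp, fn):
        # No data is loaded here; only references are stored.
        self.dp = dp
        self.fn = fn

    def __iter__(self):
        # Items are pulled from the source pipe lazily, one at a time.
        for item in self.dp:
            yield self.fn(item)
```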
- - warnings.warn("`BucketBatcher` is going to be removed from PyTorch Core") + deprecation_warning_torchdata(type(self).__name__) super().__init__() # TODO: Verify _datapippe is not going to be serialized twice diff --git a/torch/utils/data/datapipes/iter/httpreader.py b/torch/utils/data/datapipes/iter/httpreader.py index 0c8e2fc818e9fa..f13aa5bfc7e8ec 100644 --- a/torch/utils/data/datapipes/iter/httpreader.py +++ b/torch/utils/data/datapipes/iter/httpreader.py @@ -3,6 +3,7 @@ from urllib.error import HTTPError, URLError import urllib.request as urllib from torch.utils.data import IterDataPipe +from torch.utils.data.datapipes.utils.common import deprecation_warning_torchdata class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): @@ -19,6 +20,7 @@ class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): def __init__(self, datapipe, timeout=None): self.datapipe = datapipe self.timeout = timeout + deprecation_warning_torchdata(type(self).__name__) def __iter__(self): for furl in self.datapipe: diff --git a/torch/utils/data/datapipes/iter/linereader.py b/torch/utils/data/datapipes/iter/linereader.py index 04b992d647b777..311053eb973ed2 100644 --- a/torch/utils/data/datapipes/iter/linereader.py +++ b/torch/utils/data/datapipes/iter/linereader.py @@ -1,5 +1,6 @@ from typing import Tuple from torch.utils.data import IterDataPipe +from torch.utils.data.datapipes.utils.common import deprecation_warning_torchdata class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]): @@ -14,6 +15,7 @@ class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]): def __init__(self, datapipe): self.datapipe = datapipe + deprecation_warning_torchdata(type(self).__name__) def __iter__(self): for file_name, stream in self.datapipe: diff --git a/torch/utils/data/datapipes/iter/tararchivereader.py b/torch/utils/data/datapipes/iter/tararchivereader.py index 95c3331ee647bd..60bb16f149e909 100644 --- a/torch/utils/data/datapipes/iter/tararchivereader.py +++ b/torch/utils/data/datapipes/iter/tararchivereader.py @@ -1,5 +1,5 @@ from torch.utils.data import IterDataPipe -from torch.utils.data.datapipes.utils.common import validate_pathname_binary_tuple +from torch.utils.data.datapipes.utils.common import validate_pathname_binary_tuple, deprecation_warning_torchdata from typing import Iterable, Iterator, Tuple, Optional, IO, cast from io import BufferedIOBase @@ -34,11 +34,13 @@ def __init__( self.datapipe: Iterable[Tuple[str, BufferedIOBase]] = datapipe self.mode: str = mode self.length: int = length + deprecation_warning_torchdata(type(self).__name__) def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: for data in self.datapipe: validate_pathname_binary_tuple(data) pathname, data_stream = data + folder_name = os.path.dirname(pathname) try: # typing.cast is used here to silence mypy's type checker tar = tarfile.open(fileobj=cast(Optional[IO[bytes]], data_stream), mode=self.mode) @@ -49,14 +51,12 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: if extracted_fobj is None: warnings.warn("failed to extract file {} from source tarfile {}".format(tarinfo.name, pathname)) raise tarfile.ExtractError - inner_pathname = os.path.normpath(os.path.join(pathname, tarinfo.name)) + inner_pathname = os.path.normpath(os.path.join(folder_name, tarinfo.name)) yield (inner_pathname, extracted_fobj) # type: ignore[misc] except Exception as e: warnings.warn( "Unable to extract files from corrupted tarfile stream {} due to: {}, abort!".format(pathname, e)) raise e - finally: - data_stream.close() def 
__len__(self): if self.length == -1: diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py index 9ba80e3576f77d..246244a6efdbaa 100644 --- a/torch/utils/data/datapipes/iter/utils.py +++ b/torch/utils/data/datapipes/iter/utils.py @@ -1,3 +1,5 @@ +import copy +import warnings from torch.utils.data import IterDataPipe @@ -8,12 +10,34 @@ class IterableWrapperIterDataPipe(IterDataPipe): Args: iterable: Iterable object to be wrapped into an IterDataPipe + deepcopy: Option to deepcopy input iterable object for each + iteration. + + .. note:: + If `deepcopy` is set to False explicitly, users should ensure + that data pipeline doesn't contain any in-place operations over + the iterable instance, in order to prevent data inconsistency + across iterations. """ - def __init__(self, iterable): + def __init__(self, iterable, deepcopy=True): self.iterable = iterable + self.deepcopy = deepcopy def __iter__(self): - for data in self.iterable: + source_data = self.iterable + if self.deepcopy: + try: + source_data = copy.deepcopy(self.iterable) + # For the case that data cannot be deep-copied, + # all in-place operations will affect iterable variable. + # When this DataPipe is iterated second time, it will + # yield modified items. + except TypeError: + warnings.warn( + "The input iterable can not be deepcopied, " + "please be aware of in-place modification would affect source data" + ) + for data in source_data: yield data def __len__(self): diff --git a/torch/utils/data/datapipes/iter/ziparchivereader.py b/torch/utils/data/datapipes/iter/ziparchivereader.py index 66af28e1fbb4a8..988384b79b2f7b 100644 --- a/torch/utils/data/datapipes/iter/ziparchivereader.py +++ b/torch/utils/data/datapipes/iter/ziparchivereader.py @@ -1,5 +1,5 @@ from torch.utils.data import IterDataPipe -from torch.utils.data.datapipes.utils.common import validate_pathname_binary_tuple +from torch.utils.data.datapipes.utils.common import validate_pathname_binary_tuple, deprecation_warning_torchdata from typing import Iterable, Iterator, Tuple, IO, cast from io import BufferedIOBase @@ -31,11 +31,13 @@ def __init__( super().__init__() self.datapipe: Iterable[Tuple[str, BufferedIOBase]] = datapipe self.length: int = length + deprecation_warning_torchdata(type(self).__name__) def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: for data in self.datapipe: validate_pathname_binary_tuple(data) pathname, data_stream = data + folder_name = os.path.dirname(pathname) try: # typing.cast is used here to silence mypy's type checker zips = zipfile.ZipFile(cast(IO[bytes], data_stream)) @@ -47,7 +49,7 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: elif zipinfo.filename.endswith('/'): continue extracted_fobj = zips.open(zipinfo) - inner_pathname = os.path.normpath(os.path.join(pathname, zipinfo.filename)) + inner_pathname = os.path.normpath(os.path.join(folder_name, zipinfo.filename)) yield (inner_pathname, extracted_fobj) # type: ignore[misc] except Exception as e: warnings.warn( diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py index bdb5a26660d8c9..a0ed36d388154f 100644 --- a/torch/utils/data/datapipes/utils/common.py +++ b/torch/utils/data/datapipes/utils/common.py @@ -63,3 +63,9 @@ def validate_pathname_binary_tuple(data): raise TypeError("pathname binary tuple should have string type pathname, but got {}".format(type(data[0]))) if not isinstance(data[1], BufferedIOBase): raise TypeError("pathname binary tuple should have 
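Sketch of the new `deepcopy` option on `IterableWrapper` (assuming the alias exported from `torch.utils.data.datapipes.iter`): with the default, per-iteration mutations no longer leak back into the source data.

```python
from torch.utils.data.datapipes.iter import IterableWrapper

data = [[0], [1], [2]]
dp = IterableWrapper(data)                          # deepcopy=True: fresh copy per iteration
dp_shared = IterableWrapper(data, deepcopy=False)   # shares `data`; in-place edits persist

for row in dp:
    row.append(99)   # mutates only this iteration's copy
print(data)          # [[0], [1], [2]] -- source unchanged
```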
BufferedIOBase based binary type, but got {}".format(type(data[1]))) + +# Warns user that the DataPipe has been moved to TorchData and will be removed from `torch` +def deprecation_warning_torchdata(name): + warnings.warn(f"{name} and its functional API are deprecated and will be removed from the package `torch`. " + f"Please import those features from the new package TorchData: https://github.com/pytorch/data", + DeprecationWarning) diff --git a/torch/utils/data/sampler.py b/torch/utils/data/sampler.py index 79033470995dc5..232302e53c5418 100644 --- a/torch/utils/data/sampler.py +++ b/torch/utils/data/sampler.py @@ -112,15 +112,18 @@ def num_samples(self) -> int: def __iter__(self) -> Iterator[int]: n = len(self.data_source) if self.generator is None: - self.generator = torch.Generator() - self.generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item())) + seed = int(torch.empty((), dtype=torch.int64).random_().item()) + generator = torch.Generator() + generator.manual_seed(seed) + else: + generator = self.generator if self.replacement: for _ in range(self.num_samples // 32): - yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=self.generator).tolist() - yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=self.generator).tolist() + yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist() + yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist() else: - yield from torch.randperm(n, generator=self.generator).tolist() + yield from torch.randperm(n, generator=generator).tolist() def __len__(self) -> int: return self.num_samples @@ -140,7 +143,8 @@ def __init__(self, indices: Sequence[int], generator=None) -> None: self.generator = generator def __iter__(self) -> Iterator[int]: - return (self.indices[i] for i in torch.randperm(len(self.indices), generator=self.generator)) + for i in torch.randperm(len(self.indices), generator=self.generator): + yield self.indices[i] def __len__(self) -> int: return len(self.indices) @@ -183,7 +187,7 @@ def __init__(self, weights: Sequence[float], num_samples: int, def __iter__(self) -> Iterator[int]: rand_tensor = torch.multinomial(self.weights, self.num_samples, self.replacement, generator=self.generator) - return iter(rand_tensor.tolist()) + yield from iter(rand_tensor.tolist()) def __len__(self) -> int: return self.num_samples diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py old mode 100644 new mode 100755 index ad2903f7ad655f..b9d084a8decdac --- a/torch/utils/hipify/hipify_python.py +++ b/torch/utils/hipify/hipify_python.py @@ -117,15 +117,16 @@ def match_extensions(filename: str, extensions: Iterable) -> bool: """Helper method to see if filename ends with certain extension""" return any(filename.endswith(e) for e in extensions) +def _fnmatch(filepath, patterns): + return any(fnmatch.fnmatch(filepath, pattern) for pattern in patterns) + def matched_files_iter( root_path: str, - includes: Iterable = ('*',), + includes: Iterable = (), ignores: Iterable = (), extensions: Iterable = (), out_of_place_only: bool = False, is_pytorch_extension: bool = False) -> Iterator[str]: - def _fnmatch(filepath, patterns): - return any(fnmatch.fnmatch(filepath, pattern) for pattern in patterns) exact_matches = set(includes) @@ -145,7 +146,8 @@ def _fnmatch(filepath, patterns): if "third_party" in dirs: dirs.remove("third_party") for filename in filenames: - 
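Since the reworked `RandomSampler.__iter__` above no longer caches a generator it seeded itself, reproducible shuffling is obtained by passing an explicit generator. A small sketch:

```python
import torch
from torch.utils.data import RandomSampler

g = torch.Generator()
g.manual_seed(0)
sampler = RandomSampler(range(8), generator=g)
print(list(sampler))  # same permutation whenever the generator is re-seeded with 0
```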
filepath = os.path.join(rel_dirpath, filename) + filepath = os.path.join(abs_dirpath, filename) + rel_filepath = os.path.join(rel_dirpath, filename) # We respect extensions, UNLESS you wrote the entire # filename verbatim, in which case we always accept it if ( @@ -154,9 +156,9 @@ def _fnmatch(filepath, patterns): and (match_extensions(filepath, extensions) or filepath in exact_matches) ): if not is_pytorch_extension: # for pytorch extensions, consider all files - if not is_pytorch_file(filepath) and not is_caffe2_gpu_file(filepath): + if not is_pytorch_file(rel_filepath) and not is_caffe2_gpu_file(rel_filepath): continue - if out_of_place_only and not is_out_of_place(filepath): + if out_of_place_only and not is_out_of_place(rel_filepath): continue yield filepath @@ -165,59 +167,23 @@ def preprocess_file_and_save_result( output_directory: str, filepath: str, all_files: Iterable, - includes: Iterable, + header_include_dirs: Iterable, stats: Dict[str, List], hip_clang_launch: bool, is_pytorch_extension: bool, clean_ctx: GeneratedFileCleaner, show_progress: bool) -> None: - result = preprocessor(output_directory, filepath, all_files, includes, stats, + result = preprocessor(output_directory, filepath, all_files, header_include_dirs, stats, hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress) fin_path = os.path.abspath(os.path.join(output_directory, filepath)) # Show what happened - if show_progress: + if show_progress and "ignored" not in result["status"]: print( fin_path, "->", - result["hipified_path"], result["status"]) - - if result["hipified_path"] is not None: - HIPIFY_FINAL_RESULT[fin_path] = result - - -def preprocess( - output_directory: str, - all_files: Iterable, - includes: Iterable, - show_detailed: bool = False, - show_progress: bool = True, - hip_clang_launch: bool = False, - is_pytorch_extension: bool = False, - clean_ctx: GeneratedFileCleaner = None) -> HipifyFinalResult: - """ - Call preprocessor on selected files. - - Arguments) - show_detailed - Show a detailed summary of the transpilation process. - """ - - if clean_ctx is None: - clean_ctx = GeneratedFileCleaner(keep_intermediates=True) - - # Preprocessing statistics. - stats: Dict[str, List] = {"unsupported_calls": [], "kernel_launches": []} - - for filepath in all_files: - preprocess_file_and_save_result(output_directory, filepath, all_files, includes, stats, - hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress) + result["hipified_path"], result["status"], flush=True) - print(bcolors.OKGREEN + "Successfully preprocessed all matching files." + bcolors.ENDC, file=sys.stderr) - - # Show detailed summary - if show_detailed: - compute_stats(stats) - - return HIPIFY_FINAL_RESULT + HIPIFY_FINAL_RESULT[fin_path] = result def compute_stats(stats): @@ -509,16 +475,17 @@ def replace_extern_shared(input_string): return output_string -def get_hip_file_path(filepath, is_pytorch_extension=False): +def get_hip_file_path(rel_filepath, is_pytorch_extension=False): """ Returns the new name of the hipified file """ # At the moment, some PyTorch source files are HIPified in place. The predicate # is_out_of_place tells us if this is the case or not. 
- if not is_pytorch_extension and not is_out_of_place(filepath): - return filepath + assert(not os.path.isabs(rel_filepath)) + if not is_pytorch_extension and not is_out_of_place(rel_filepath): + return rel_filepath - dirpath, filename = os.path.split(filepath) + dirpath, filename = os.path.split(rel_filepath) root, ext = os.path.splitext(filename) # Here's the plan: @@ -562,6 +529,7 @@ def get_hip_file_path(filepath, is_pytorch_extension=False): orig_dirpath = dirpath dirpath = dirpath.replace('cuda', 'hip') + dirpath = dirpath.replace('CUDA', 'HIP') dirpath = dirpath.replace('THC', 'THH') root = root.replace('cuda', 'hip') @@ -579,31 +547,34 @@ def get_hip_file_path(filepath, is_pytorch_extension=False): return os.path.join(dirpath, root + ext) -def is_out_of_place(filepath): - if filepath.startswith("torch/"): +def is_out_of_place(rel_filepath): + assert(not os.path.isabs(rel_filepath)) + if rel_filepath.startswith("torch/"): return False - if filepath.startswith("tools/autograd/templates/"): + if rel_filepath.startswith("tools/autograd/templates/"): return False return True # Keep this synchronized with includes/ignores in build_amd.py -def is_pytorch_file(filepath): - if filepath.startswith("aten/"): - if filepath.startswith("aten/src/ATen/core/"): +def is_pytorch_file(rel_filepath): + assert(not os.path.isabs(rel_filepath)) + if rel_filepath.startswith("aten/"): + if rel_filepath.startswith("aten/src/ATen/core/"): return False return True - if filepath.startswith("torch/"): + if rel_filepath.startswith("torch/"): return True - if filepath.startswith("tools/autograd/templates/"): + if rel_filepath.startswith("tools/autograd/templates/"): return True return False -def is_caffe2_gpu_file(filepath): - if filepath.startswith("c10/cuda"): +def is_caffe2_gpu_file(rel_filepath): + assert(not os.path.isabs(rel_filepath)) + if rel_filepath.startswith("c10/cuda"): return True - filename = os.path.basename(filepath) + filename = os.path.basename(rel_filepath) _, ext = os.path.splitext(filename) return ('gpu' in filename or ext in ['.cu', '.cuh']) and ('cudnn' not in filename) @@ -697,31 +668,36 @@ def pattern(self): Returns a dict with the following keys: "hipified_path" : absolute path of hipified source file "status" : "ok" if hipified file was written out - "skipped" if an identical hipified file already existed - "ignored" if the source file was a hipified file itself + "skipped" if an identical hipified file already existed or hipified file couldn't be written out + "ignored" if the source file was a hipified file itself or not meant to be hipified """ def preprocessor( output_directory: str, filepath: str, all_files: Iterable, - includes: Iterable, + header_include_dirs: Iterable, stats: Dict[str, List], hip_clang_launch: bool, is_pytorch_extension: bool, clean_ctx: GeneratedFileCleaner, show_progress: bool) -> HipifyResult: """ Executes the CUDA -> HIP conversion on the specified file. 
""" + if filepath not in all_files: + return {"hipified_path": None, "status": "[ignored, not to be hipified]"} + fin_path = os.path.abspath(os.path.join(output_directory, filepath)) + rel_filepath = os.path.relpath(filepath, output_directory) with open(fin_path, 'r', encoding='utf-8') as fin: if fin.readline() == HIPIFY_C_BREADCRUMB: - return {"hipified_path": None, "status": "ignored"} + return {"hipified_path": None, "status": "[ignored, input is hipified output]"} fin.seek(0) output_source = fin.read() orig_output_source = output_source - fout_path = os.path.abspath(os.path.join(output_directory, get_hip_file_path(filepath, is_pytorch_extension))) + # get_hip_file_path needs a relative path to work correctly + fout_path = os.path.abspath(os.path.join(output_directory, get_hip_file_path(rel_filepath, is_pytorch_extension))) if not os.path.exists(os.path.dirname(fout_path)): clean_ctx.makedirs(os.path.dirname(fout_path)) @@ -732,7 +708,7 @@ def pt_repl(m): if is_pytorch_extension: output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source) else: - if is_pytorch_file(filepath): + if is_pytorch_file(rel_filepath): output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source) else: def c2_repl(m): @@ -766,8 +742,8 @@ def repl(m): header_filepath = header_path_to_check # If not found, look in include dirs one by one and first match wins if header_filepath is None: - for include in includes: - header_dir_to_check = os.path.join(output_directory, os.path.dirname(include)) + for header_include_dir in header_include_dirs: + header_dir_to_check = os.path.join(output_directory, header_include_dir) header_path_to_check = os.path.abspath(os.path.join(header_dir_to_check, f)) if os.path.exists(header_path_to_check): header_dir = header_dir_to_check @@ -778,12 +754,12 @@ def repl(m): # Hipify header file first if needed if header_filepath not in HIPIFY_FINAL_RESULT: preprocess_file_and_save_result(output_directory, - os.path.relpath(header_filepath, output_directory), - all_files, includes, stats, hip_clang_launch, is_pytorch_extension, - clean_ctx, show_progress) - value = HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"] - assert value is not None - return templ.format(os.path.relpath(value, header_dir)) + header_filepath, + all_files, header_include_dirs, stats, hip_clang_launch, + is_pytorch_extension, clean_ctx, show_progress) + hipified_header_filepath = HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"] + return templ.format(os.path.relpath(hipified_header_filepath if hipified_header_filepath is not None + else header_filepath, header_dir)) return m.group(0) return repl @@ -817,7 +793,7 @@ def repl(m): and orig_output_source == output_source and os.path.dirname(fin_path) == os.path.dirname(fout_path) ): - return {"hipified_path": fin_path, "status": "ok"} + return {"hipified_path": fin_path, "status": "[skipped, no changes]"} # Add hipify breadcrumb for C-style files to avoid re-hipification if fin_path != fout_path and match_extensions(fin_path, (".cu", ".cuh", ".c", ".cc", ".cpp", ".h", ".hpp")): @@ -831,13 +807,13 @@ def repl(m): try: with clean_ctx.open(fout_path, 'w', encoding='utf-8') as fout: fout.write(output_source) - return {"hipified_path": fout_path, "status": "ok"} + return {"hipified_path": fout_path, "status": "[ok]"} except PermissionError as e: print(f"{bcolors.WARNING}Failed to save {fout_path} with \"{e.strerror}\", leaving {fin_path} unchanged.{bcolors.ENDC}", file=sys.stderr) - return {"hipified_path": fin_path, "status": "skipped"} + return 
{"hipified_path": fin_path, "status": "[skipped, no permissions]"} else: - return {"hipified_path": fout_path, "status": "skipped"} + return {"hipified_path": fout_path, "status": "[skipped, already hipified]"} def file_specific_replacement(filepath, search_string, replace_string, strict=False): with openf(filepath, "r+") as f: @@ -932,14 +908,17 @@ def hipify( project_directory: str, show_detailed: bool = False, extensions: Iterable = (".cu", ".cuh", ".c", ".cc", ".cpp", ".h", ".in", ".hpp"), + header_extensions: Iterable = (".cuh", ".h", ".hpp"), output_directory: str = "", - includes: Iterable = (), + header_include_dirs: Iterable = (), + includes: Iterable = ('*',), extra_files: Iterable = (), out_of_place_only: bool = False, ignores: Iterable = (), show_progress: bool = True, hip_clang_launch: bool = False, is_pytorch_extension: bool = False, + hipify_extra_files_only: bool = False, clean_ctx: GeneratedFileCleaner = None ) -> HipifyFinalResult: if project_directory == "": @@ -955,6 +934,10 @@ def hipify( project_directory.rstrip("/") output_directory = project_directory + "_amd" + if project_directory != output_directory: + includes = [include.replace(project_directory, output_directory) for include in includes] + ignores = [ignore.replace(project_directory, output_directory) for ignore in ignores] + # Copy from project directory to output directory if not done already. if not os.path.exists(output_directory): shutil.copytree(project_directory, output_directory) @@ -964,19 +947,42 @@ def hipify( out_of_place_only=out_of_place_only, is_pytorch_extension=is_pytorch_extension)) all_files_set = set(all_files) - # Convert extra_files to relative paths since all_files has all relative paths for f in extra_files: - f_rel = os.path.relpath(f, output_directory) - if f_rel not in all_files_set: - all_files.append(f_rel) - - # Start Preprocessor - return preprocess( - output_directory, - all_files, - includes, - show_detailed=show_detailed, - show_progress=show_progress, - hip_clang_launch=hip_clang_launch, - is_pytorch_extension=is_pytorch_extension, - clean_ctx=clean_ctx) + if not os.path.isabs(f): + f = os.path.join(output_directory, f) + if f not in all_files_set: + all_files.append(f) + + # List all files in header_include_paths to ensure they are hipified + from pathlib import Path + for header_include_dir in header_include_dirs: + if os.path.isabs(header_include_dir): + header_include_dir_path = Path(header_include_dir) + else: + header_include_dir_path = Path(os.path.join(output_directory, header_include_dir)) + for path in header_include_dir_path.rglob('*'): + if ( + path.is_file() + and _fnmatch(str(path), includes) + and (not _fnmatch(str(path), ignores)) + and match_extensions(path.name, header_extensions) + ): + all_files.append(str(path)) + + if clean_ctx is None: + clean_ctx = GeneratedFileCleaner(keep_intermediates=True) + + # Preprocessing statistics. + stats: Dict[str, List] = {"unsupported_calls": [], "kernel_launches": []} + + for filepath in (all_files if not hipify_extra_files_only else extra_files): + preprocess_file_and_save_result(output_directory, filepath, all_files, header_include_dirs, + stats, hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress) + + print(bcolors.OKGREEN + "Successfully preprocessed all matching files." + bcolors.ENDC, file=sys.stderr) + + # Show detailed summary + if show_detailed: + compute_stats(stats) + + return HIPIFY_FINAL_RESULT