diff --git a/.circleci/config.yml b/.circleci/config.yml index 9318a2d07720f8..0bd8e8e1a4a272 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -632,7 +632,8 @@ jobs: } if is_vanilla_build; then - echo "apt-get update && apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash + echo "apt-get update || apt-get install libgnutls30" | docker exec -u root -i "$id" bash + echo "apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash echo "cd workspace/build; qemu-x86_64 -g 2345 -cpu Broadwell -E ATEN_CPU_CAPABILITY=default ./bin/basic --gtest_filter=BasicTest.BasicTestCPU & gdb ./bin/basic -ex 'set pagination off' -ex 'target remote :2345' -ex 'continue' -ex 'bt' -ex='set confirm off' -ex 'quit \$_isvoid(\$_exitcode)'" | docker exec -u jenkins -i "$id" bash else echo "Skipping for ${BUILD_ENVIRONMENT}" @@ -1734,16 +1735,17 @@ jobs: # install fastlane sudo gem install bundler && bundle install # install certificates - echo ${IOS_CERT_KEY} >> cert.txt + echo ${IOS_CERT_KEY_2022} >> cert.txt base64 --decode cert.txt -o Certificates.p12 rm cert.txt - bundle exec fastlane install_cert + bundle exec fastlane install_root_cert + bundle exec fastlane install_dev_cert # install the provisioning profile - PROFILE=PyTorch_CI_2021.mobileprovision + PROFILE=PyTorch_CI_2022.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" - echo ${IOS_SIGN_KEY} >> cert.txt + echo ${IOS_SIGN_KEY_2022} >> cert.txt base64 --decode cert.txt -o ${PROFILE} rm cert.txt - run: @@ -1802,7 +1804,7 @@ jobs: command: | set -e PROJ_ROOT=/Users/distiller/project - PROFILE=PyTorch_CI_2021 + PROFILE=PyTorch_CI_2022 # run the ruby build script if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index bebbd3f04ddb5b..6e6acf518d8903 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -329,6 +329,7 @@ docker build \ --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ + --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx900;gfx906;gfx908}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index 044ff869a2bf71..57156b31fd8e35 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -11,8 +11,13 @@ install_ubuntu() { # "$UBUNTU_VERSION" == "18.04" if [[ "$UBUNTU_VERSION" == "18.04"* ]]; then cmake3="cmake=3.10*" + maybe_libiomp_dev="libiomp-dev" + elif [[ "$UBUNTU_VERSION" == "20.04"* ]]; then + cmake3="cmake=3.16*" + maybe_libiomp_dev="" else cmake3="cmake=3.5*" + maybe_libiomp_dev="libiomp-dev" fi # Install common dependencies @@ -33,7 +38,7 @@ install_ubuntu() { git \ libatlas-base-dev \ libc6-dbg \ - libiomp-dev \ + ${maybe_libiomp_dev} \ libyaml-dev \ libz-dev \ libjpeg-dev \ @@ -107,11 +112,13 @@ case "$ID" in esac # Install Valgrind separately since the apt-get version is too old. +OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +if [[ $ID == centos && $OS_VERSION == 7 ]]; then WGET_FLAG="--no-check-certificate" ; else WGET_FLAG=""; fi mkdir valgrind_build && cd valgrind_build VALGRIND_VERSION=3.16.1 -if ! wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2 +if ! 
wget $WGET_FLAG http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2 then - wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2 + wget $WGET_FLAG https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2 fi tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2 cd valgrind-${VALGRIND_VERSION} diff --git a/.circleci/docker/common/install_rocm.sh b/.circleci/docker/common/install_rocm.sh index fa4df9a81b4087..d454b9bfddc4f6 100644 --- a/.circleci/docker/common/install_rocm.sh +++ b/.circleci/docker/common/install_rocm.sh @@ -14,7 +14,7 @@ install_magma() { cp make.inc-examples/make.inc.hip-gcc-mkl make.inc echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc - echo 'DEVCCFLAGS += --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906 --amdgpu-target=gfx908 --gpu-max-threads-per-block=256' >> make.inc + echo 'DEVCCFLAGS += --amdgpu-target=gfx900 --amdgpu-target=gfx906 --amdgpu-target=gfx908 --amdgpu-target=gfx90a --gpu-max-threads-per-block=256' >> make.inc # hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition sed -i 's/^FOPENMP/#FOPENMP/g' make.inc export PATH="${PATH}:/opt/rocm/bin" @@ -29,12 +29,19 @@ ver() { printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' '); } +# Map ROCm version to AMDGPU version +declare -A AMDGPU_VERSIONS=( ["4.5.2"]="21.40.2" ["5.0"]="21.50" ["5.0.1"]="21.50.1" ["5.1"]="22.10" ["5.1.1"]="22.10.1" ["5.1.3"]="22.10.3" ["5.2"]="22.20" ["5.2.1"]="22.20.1" ) + install_ubuntu() { apt-get update if [[ $UBUNTU_VERSION == 18.04 ]]; then # gpg-agent is not available by default on 18.04 apt-get install -y --no-install-recommends gpg-agent fi + if [[ $UBUNTU_VERSION == 20.04 ]]; then + # gpg-agent is not available by default on 20.04 + apt-get install -y --no-install-recommends gpg-agent + fi apt-get install -y kmod apt-get install -y wget @@ -42,6 +49,13 @@ install_ubuntu() { apt-get install -y libc++1 apt-get install -y libc++abi1 + if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then + # Add amdgpu repository + UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` + local amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu" + echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list + fi + ROCM_REPO="ubuntu" if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then ROCM_REPO="xenial" @@ -49,7 +63,8 @@ install_ubuntu() { # Add rocm repository wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - - echo "deb [arch=amd64] http://repo.radeon.com/rocm/apt/${ROCM_VERSION} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list + local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}" + echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list apt-get update --allow-insecure-repositories DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ @@ -86,11 +101,24 @@ install_centos() { yum install -y epel-release yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r` + if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then + # Add amdgpu repository + local amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64" + echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo + echo 
"name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo + echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo + echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo + fi + + local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" echo "[ROCm]" > /etc/yum.repos.d/rocm.repo echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo - echo "baseurl=http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" >> /etc/yum.repos.d/rocm.repo + echo "baseurl=${rocm_baseurl}" >> /etc/yum.repos.d/rocm.repo echo "enabled=1" >> /etc/yum.repos.d/rocm.repo - echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo + echo "gpgcheck=1" >> /etc/yum.repos.d/rocm.repo + echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/rocm.repo yum update -y diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh index db2b0660d9f506..8e5b0b99c854b7 100755 --- a/.circleci/scripts/binary_checkout.sh +++ b/.circleci/scripts/binary_checkout.sh @@ -61,7 +61,7 @@ git --no-pager log --max-count 1 popd # Clone the Builder master repo -retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT" +retry git clone -q https://github.com/pytorch/builder.git -b release/1.10 "$BUILDER_ROOT" pushd "$BUILDER_ROOT" echo "Using builder from " git --no-pager log --max-count 1 diff --git a/.circleci/scripts/binary_ios_test.sh b/.circleci/scripts/binary_ios_test.sh index 5280b62a4e708c..c3d91a231ee88f 100644 --- a/.circleci/scripts/binary_ios_test.sh +++ b/.circleci/scripts/binary_ios_test.sh @@ -8,16 +8,17 @@ cd ${PROJ_ROOT}/ios/TestApp # install fastlane sudo gem install bundler && bundle install # install certificates -echo "${IOS_CERT_KEY}" >> cert.txt +echo "${IOS_CERT_KEY_2022}" >> cert.txt base64 --decode cert.txt -o Certificates.p12 rm cert.txt -bundle exec fastlane install_cert +bundle exec fastlane install_root_cert +bundle exec fastlane install_dev_cert # install the provisioning profile -PROFILE=PyTorch_CI_2021.mobileprovision +PROFILE=PyTorch_CI_2022.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" -echo "${IOS_SIGN_KEY}" >> cert.txt +echo "${IOS_SIGN_KEY_2022}" >> cert.txt base64 --decode cert.txt -o ${PROFILE} rm cert.txt # run the ruby build script @@ -25,5 +26,5 @@ if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' 
exit 1 fi -PROFILE=PyTorch_CI_2021 +PROFILE=PyTorch_CI_2022 ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM} -c ${PROFILE} -t ${IOS_DEV_TEAM_ID} -f Accelerate,MetalPerformanceShaders,CoreML diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index 765c1d82067e10..f82512236b8735 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -467,16 +467,17 @@ # install fastlane sudo gem install bundler && bundle install # install certificates - echo ${IOS_CERT_KEY} >> cert.txt + echo ${IOS_CERT_KEY_2022} >> cert.txt base64 --decode cert.txt -o Certificates.p12 rm cert.txt - bundle exec fastlane install_cert + bundle exec fastlane install_root_cert + bundle exec fastlane install_dev_cert # install the provisioning profile - PROFILE=PyTorch_CI_2021.mobileprovision + PROFILE=PyTorch_CI_2022.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" - echo ${IOS_SIGN_KEY} >> cert.txt + echo ${IOS_SIGN_KEY_2022} >> cert.txt base64 --decode cert.txt -o ${PROFILE} rm cert.txt - run: @@ -535,7 +536,7 @@ command: | set -e PROJ_ROOT=/Users/distiller/project - PROFILE=PyTorch_CI_2021 + PROFILE=PyTorch_CI_2022 # run the ruby build script if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 437a28b1c98185..7e7a2e07989ed7 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -158,7 +158,8 @@ jobs: } if is_vanilla_build; then - echo "apt-get update && apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash + echo "apt-get update || apt-get install libgnutls30" | docker exec -u root -i "$id" bash + echo "apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash echo "cd workspace/build; qemu-x86_64 -g 2345 -cpu Broadwell -E ATEN_CPU_CAPABILITY=default ./bin/basic --gtest_filter=BasicTest.BasicTestCPU & gdb ./bin/basic -ex 'set pagination off' -ex 'target remote :2345' -ex 'continue' -ex 'bt' -ex='set confirm off' -ex 'quit \$_isvoid(\$_exitcode)'" | docker exec -u jenkins -i "$id" bash else echo "Skipping for ${BUILD_ENVIRONMENT}" diff --git a/.gitmodules b/.gitmodules index a7cc437f438402..0e2dc99d5bc5f4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -135,7 +135,7 @@ url = https://github.com/NVIDIA/cudnn-frontend.git [submodule "third_party/kineto"] path = third_party/kineto - url = https://github.com/pytorch/kineto + url = https://github.com/mwootton/kineto [submodule "third_party/pocketfft"] path = third_party/pocketfft url = https://github.com/mreineck/pocketfft diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index 0cfb5e01184a7c..338b3e1e3de7cd 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -63,9 +63,9 @@ function get_pr_change_files() { function file_diff_from_base() { # The fetch may fail on Docker hosts, this fetch is necessary for GHA set +e - git fetch origin master --quiet + git fetch origin release/1.10 --quiet set -e - git diff --name-only "$(git merge-base origin/master HEAD)" > "$1" + git diff --name-only 
"$(git merge-base origin/release/1.10 HEAD)" > "$1" } function get_bazel() { @@ -99,5 +99,5 @@ function checkout_install_torchvision() { } function clone_pytorch_xla() { - git clone --recursive https://github.com/pytorch/xla.git + git clone --recursive -b r1.10 https://github.com/pytorch/xla.git } diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 3f39431ba54a1d..60d4015c78e783 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -230,6 +230,8 @@ test_aten() { test_without_numpy() { pushd "$(dirname "${BASH_SOURCE[0]}")" python -c "import sys;sys.path.insert(0, 'fake_numpy');from unittest import TestCase;import torch;x=torch.randn(3,3);TestCase().assertRaises(RuntimeError, lambda: x.numpy())" + # Regression test for https://github.com/pytorch/pytorch/issues/66353 + python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;print(torch.tensor([torch.tensor(0.), torch.tensor(1.)]))" popd } @@ -424,7 +426,7 @@ test_backward_compatibility() { python -m venv venv # shellcheck disable=SC1091 . venv/bin/activate - pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html + pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_nightly.html pip show torch python dump_all_function_schemas.py --filename nightly_schemas.txt deactivate diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c11507838fde1..0d8d40d01013e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,7 +148,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) -option(BUILD_CAFFE2 "Master flag to build Caffe2" ON) +option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF) option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF) cmake_dependent_option( BUILD_CAFFE2_OPS "Build Caffe2 operators" ON diff --git a/aten/src/ATen/ConjugateFallback.cpp b/aten/src/ATen/ConjugateFallback.cpp index 21c044ee074e48..79076d67658c11 100644 --- a/aten/src/ATen/ConjugateFallback.cpp +++ b/aten/src/ATen/ConjugateFallback.cpp @@ -2,21 +2,12 @@ #include namespace at { - +namespace native { struct ConjFallback : MathOpFallback { ConjFallback() : MathOpFallback(DispatchKey::Conjugate, "conjugate") {} bool is_bit_set(const Tensor& tensor) override { return tensor.is_conj(); } - void _set_bit(const Tensor& tensor, bool value) override { - return tensor._set_conj(value); - } - Tensor resolve_bit(const Tensor& tensor) override { - return at::resolve_conj(tensor); - } - Tensor& math_op_(Tensor& tensor) override { - return at::conj_physical_(tensor); - } }; void conjugateFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { @@ -60,4 +51,5 @@ TORCH_LIBRARY_IMPL(aten, Conjugate, m) { TENSOR_UTILITIES_AND_CONSTRUCTORS(m) } +} } // namespace at diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index bf06efa2a2bb5c..42abef9cc3cac4 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -254,6 +254,20 @@ bool NoTF32Guard::should_disable_tf32() { return override_allow_tf32_flag; } +thread_local bool BackwardPassGuard::is_backward_pass_; + +BackwardPassGuard::BackwardPassGuard() { + is_backward_pass_ = true; +} + +BackwardPassGuard::~BackwardPassGuard() { + is_backward_pass_ = false; +} + +bool BackwardPassGuard::is_backward_pass() { + return is_backward_pass_; +} + bool 
Context::areVmapFallbackWarningsEnabled() const { return display_vmap_fallback_warnings_; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 4a45ac6f8ac18e..f30ffb57f9f21c 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -380,4 +380,12 @@ struct TORCH_API NoTF32Guard { bool changed = false; }; +struct TORCH_API BackwardPassGuard { + BackwardPassGuard(); + ~BackwardPassGuard(); + static bool is_backward_pass(); +private: + static thread_local bool is_backward_pass_; +}; + } // namespace at diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index e9258603f4306d..f46245e72ca4ba 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -663,7 +663,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { int64_t numel = this->numel(); if (numel == 0) { return; - } else if (numel < grain_size || at::get_num_threads() == 1) { + } else if (numel < internal::GRAIN_SIZE || at::get_num_threads() == 1) { return serial_for_each(loop, {0, numel}); } else { at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) { diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 70c3dda6f3401f..6904b6d72b2368 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -4,6 +4,13 @@ #include #include +#include +#include + +#ifdef __HIP_PLATFORM_HCC__ +#define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +#define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +#endif #define CUDABLAS_POSINT_CHECK(FD, X) \ TORCH_CHECK( \ @@ -95,7 +102,7 @@ namespace at { namespace cuda { namespace blas { -const char* _cublasGetErrorEnum(cublasStatus_t error) { +C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error) { if (error == CUBLAS_STATUS_SUCCESS) { return "CUBLAS_STATUS_SUCCESS"; } @@ -272,13 +279,17 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { float falpha = alpha; float fbeta = beta; #ifdef __HIP_PLATFORM_HCC__ + int flag = 0; +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; +#endif TORCH_CUDABLAS_CHECK(rocblas_gemm_strided_batched_ex(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, (void*)&fbeta, c, rocblas_datatype_f16_r, (int)ldc, stridec, c, rocblas_datatype_f16_r, (int)ldc, stridec, (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, - 0, 0)); + 0, flag)); #else #if defined(CUDA_VERSION) && CUDA_VERSION < 11000 // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH @@ -418,6 +429,10 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); GEMM_CHECK_ARGVALUES(at::Half); #ifdef __HIP_PLATFORM_HCC__ + int flag = 0; +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::BackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +#endif TORCH_CUDABLAS_CHECK(rocblas_gemm_ex( handle, opa, @@ -442,7 +457,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, - 0)); + flag)); #else cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 5) { diff --git a/aten/src/ATen/cuda/CUDASolver.cpp b/aten/src/ATen/cuda/CUDASolver.cpp index 355238f6b2dcf4..8b906d4931feba 100644 --- a/aten/src/ATen/cuda/CUDASolver.cpp +++ b/aten/src/ATen/cuda/CUDASolver.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef CUDART_VERSION @@ -9,7 +10,7 @@ namespace at { namespace cuda { namespace solver { -const char* cusolverGetErrorMessage(cusolverStatus_t status) { +C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status) { switch (status) { case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCES"; case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h index 1414e319656bd3..d3fdfec69c9a62 100644 --- a/aten/src/ATen/cuda/Exceptions.h +++ b/aten/src/ATen/cuda/Exceptions.h @@ -2,6 +2,7 @@ #include #include +#include #ifdef CUDART_VERSION #include @@ -39,7 +40,7 @@ class CuDNNError : public c10::Error { } while (0) namespace at { namespace cuda { namespace blas { -const char* _cublasGetErrorEnum(cublasStatus_t error); +C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error); }}} // namespace at::cuda::blas #define TORCH_CUDABLAS_CHECK(EXPR) \ @@ -66,7 +67,7 @@ const char *cusparseGetErrorString(cusparseStatus_t status); #ifdef CUDART_VERSION namespace at { namespace cuda { namespace solver { -const char* cusolverGetErrorMessage(cusolverStatus_t status); +C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status); }}} // namespace at::cuda::solver #define TORCH_CUSOLVER_CHECK(EXPR) \ diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 26f804768e42c1..cf835219750964 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -235,6 +235,16 @@ struct chained_iterator { template inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, int64_t num_items) { +#if defined(USE_ROCM) && (ROCM_VERSION >= 50000) + //For ROCm, use hipCUB chained iterators + CUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::InclusiveScan, + input, + output, + scan_op, + num_items, + at::cuda::getCurrentCUDAStream()); + C10_HIP_KERNEL_LAUNCH_CHECK(); +#else // non synchronizing cub call // even though cub is supposed to support tensors with int_max elements, in reality it doesn't, // so split at int_max/2 @@ -278,10 +288,22 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT size_cub, at::cuda::getCurrentCUDAStream()); } +#endif } template inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, InitValueT init_value, int64_t num_items) { +#if defined(USE_ROCM) && (ROCM_VERSION >= 50000) + //For ROCm, use hipCUB chained iterators + CUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::ExclusiveScan, + input, + output, + scan_op, + init_value, + num_items, + at::cuda::getCurrentCUDAStream()); + C10_HIP_KERNEL_LAUNCH_CHECK(); +#else // non synchronizing cub call // even though cub is supposed to support tensors with int_max elements, in reality it doesn't, // so split at int_max/2 @@ -316,6 +338,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT 
size_cub, at::cuda::getCurrentCUDAStream()); } +#endif } template diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h index a7d6578b2889f2..3aaa9d06c5e91f 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -10,7 +10,7 @@ class DataPtr; namespace hip { namespace HIPCachingAllocatorMasqueradingAsCUDA { -Allocator* get(); +C10_HIP_API Allocator* get(); C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream); } // namespace HIPCachingAllocatorMasqueradingAsCUDA diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 7974840dd3f971..9105c831e993ad 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -240,14 +240,11 @@ Tensor _pdist_backward(const Tensor& grad, const Tensor& self, const double p, c } Tensor cosine_similarity(const Tensor& x1, const Tensor& x2, int64_t dim, double eps) { - TORCH_CHECK(x1.ndimension() == x2.ndimension(), "cosine_similarity requires both inputs to have the same number of dimensions, but x1 has ", - x1.ndimension(), " and x2 has ", x2.ndimension()); - TORCH_CHECK(x1.ndimension() == 0 || x1.size(dim) == x2.size(dim), "cosine_similarity requires both inputs to have the same size at dimension ", dim, "but x1 has ", - x1.size(dim), " and x2 has ", x2.size(dim)); + auto common_size = at::infer_size_dimvector(x1.sizes(), x2.sizes()); auto commonDtype = at::result_type(x1, x2); TORCH_CHECK(at::isFloatingType(commonDtype), "expected common dtype to be floating point, yet common dtype is ", commonDtype); - Tensor x1_ = x1.to(commonDtype); - Tensor x2_ = x2.to(commonDtype); + Tensor x1_ = x1.to(commonDtype).expand(common_size); + Tensor x2_ = x2.to(commonDtype).expand(common_size); // Follow scipy impl to improve numerical precision // Use x / sqrt(x * x) instead of x / (sqrt(x) * sqrt(x)) Tensor w12 = at::sum(x1_ * x2_, dim); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index ed5aa847bea4e9..00211942ed7e98 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -238,7 +238,7 @@ template class normal_kernel, typename RNG> Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional gen) { TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex"); TORCH_CHECK( - std.min().ge(0).item(), + std.numel() == 0 || std.min().ge(0).item(), "normal expects all elements of std >= 0.0"); bool is_deprecated_th_impl = resize_output_for_normal(output, mean, std); normal_impl_(output, 0, 1, gen); diff --git a/aten/src/ATen/native/MathBitFallThroughLists.h b/aten/src/ATen/native/MathBitFallThroughLists.h index 052e730194ee90..ea3fed748c0b1f 100644 --- a/aten/src/ATen/native/MathBitFallThroughLists.h +++ b/aten/src/ATen/native/MathBitFallThroughLists.h @@ -50,7 +50,8 @@ namespace at { m.impl("vsplit.array", torch::CppFunction::makeFallthrough()); \ m.impl("conj", torch::CppFunction::makeFallthrough()); \ m.impl("_conj", torch::CppFunction::makeFallthrough()); \ - m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); + m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); \ + m.impl("resize_", torch::CppFunction::makeFallthrough()); #define TENSOR_UTILITIES_AND_CONSTRUCTORS(m) \ m.impl("empty_like", 
torch::CppFunction::makeFallthrough()); \ diff --git a/aten/src/ATen/native/MathBitsFallback.h b/aten/src/ATen/native/MathBitsFallback.h index 2e78ebd66b66ec..89670699fcb6d7 100644 --- a/aten/src/ATen/native/MathBitsFallback.h +++ b/aten/src/ATen/native/MathBitsFallback.h @@ -3,42 +3,49 @@ #include #include #include +#include #include #include namespace at { - +namespace native { // This fallback should only be used for operations that are self inverse and have a corresponding tensor // bit (internally implemented using DispatchKey) to maintain the state on tensor using tensor bit. // Currently there are two tensor bits that trigger this fallback: conjugate bit and negative bit. // Conjugate bit is set on a tensor when `.conj()` is called and neg bit is set on a tensor when `.conj().imag` is called. +// NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit. struct MathOpFallback { MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(op_name_) {} virtual bool is_bit_set(const Tensor&) = 0; - virtual void _set_bit(const Tensor&, bool) = 0; - // materializes the bit, i.e., returns a new tensor tensor containing the true output - // (after performing the math operation corresponding to the tensor bit) if the bit is set to 1 - // else returns self. - virtual Tensor resolve_bit(const Tensor&) = 0; - // in-place operation corresponding to the math op represented by the bit. Im the future if this class - // is generalized for ops that are not self inverse, then this must be replaced by op_inverse_inplace - virtual Tensor& math_op_(Tensor&) = 0; void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { - // Situations to handle: - // 1. Out-of-place operation. Easy: materialize all inputs and - // call it a day. - // 2. Inplace operation. Desugar x.add_(2) into x.conj_().add_(2).conj_(). - // Materialize other inputs as in (1). - // 3. out= operation. Desugar add(x, 2, out=y) into y.copy_(add(x, 2)) - // Materialize other inputs as in (1). - // - // It is important to be able to tell if we READ from an argument and if we - // WRITE from an argument. Conservative approach is to assume that we always - // READ from an argument, but in out-of-place operations you can skip - // conjugating inputs on entry that never get used. In current schema we - // can't easily tell if inplace situation has happened, so don't do it. + /* + Situations to handle: + 1. Out-of-place operation. Easy: materialize all inputs and + call it a day. + 2. Inplace operation. Desugar x.add_(2) into x.conj_().add_(2).conj_(). + Materialize other inputs as in (1). + 3. out= operation. Desugar add(x, 2, out=y) into y.copy_(add(x, 2)) + Materialize other inputs as in (1). + + It is important to be able to tell if we READ from an argument and if we + WRITE to an argument. Conservative approach is to assume that we always + READ from an argument, but in out= operations you can skip + conjugating inputs on entry that never get used. In the current schema we + can't easily tell if the operation is in in-place or out= operation. + Note: + 1. Mutable tensorlists containing tensors whose math bit set to true are disallowed. + 2. Mutable tensors with math bit set to true are unconditionally cloned to ensure + correct behavior in the case when the mutable tensor shares memory with non mutable arguments. 
+ + If we were to in-place resolve the math bit for mutable inputs, then the non-mutable inputs sharing partial or full memory + with these mutable inputs would read into wrong values in the following cases: + 1. Non mutable inputs have their math bit set to false. + 2. Math bit for mutable input(s) is resolved before the non mutable inputs (with bit set to true and sharing memory + with one or more mutable arg(s)) are cloned. + At the end, the final value of the mutable arguments from the stack are copied into the original input mutable tensor inputs. + */ const auto& arguments = op.schema().arguments(); const auto num_arguments = arguments.size(); const auto stack_start = stack->size() - num_arguments; @@ -72,9 +79,8 @@ struct MathOpFallback { return; } - // Mutable inputs to be tracked separately - std::vector mutable_inputs; - + // Mutable inputs with math bit set to True and their clones + std::vector> mutable_inputs_with_their_clones; for (const auto i : c10::irange(num_arguments)) { auto& ivalue = (*stack)[stack_start + i]; if (!(ivalue.isTensor() || ivalue.isTensorList())) { @@ -91,31 +97,26 @@ struct MathOpFallback { if (!is_bit_set(ivalue.toTensor())) { continue; } - auto tensor = std::move(ivalue).toTensor(); TORCH_CHECK_NOT_IMPLEMENTED(!tensor.is_meta(), op_name, " fallback does not support meta tensors."); + auto resolved_tensor = at::clone(tensor); if (mut_arg) { - // TODO: This is a waste if the argument is write only - _set_bit(tensor, false); - math_op_(tensor); - mutable_inputs.emplace_back(tensor); - } else { - tensor = resolve_bit(tensor); + TORCH_CHECK(mutable_inputs_with_their_clones.empty(), op_name, " fallback does not support operators with more than one mutable tensors with ", + op_name, "bit set to true."); + mutable_inputs_with_their_clones.emplace_back(std::make_pair(std::move(tensor), resolved_tensor)); } - (*stack)[stack_start + i] = std::move(tensor); + (*stack)[stack_start + i] = std::move(resolved_tensor); } else if (ivalue.isTensorList()) { auto tensors = std::move(ivalue).toTensorList(); - if (mut_arg) { - for(const auto j : c10::irange(tensors.size())) { - Tensor t = tensors[j]; - _set_bit(t, false); - math_op_(t); - mutable_inputs.emplace_back(t); - } - } else { - for(const auto j : c10::irange(tensors.size())) { - tensors[j] = resolve_bit(tensors[j]); + for(const auto j : c10::irange(tensors.size())) { + const auto& tensor = tensors[j]; + if (!is_bit_set(tensor)) { + continue; } + TORCH_CHECK(!mut_arg, " fallback doesn't currently support mutable TensorLists with ", + op_name, " inputs. 
Please materialize all the ", op_name, " input tensor(s) in the mutable TensorList inputs before calling ", + op.schema().name()); + tensors[j] = at::clone(tensor); } (*stack)[stack_start + i] = std::move(tensors); } @@ -123,9 +124,22 @@ struct MathOpFallback { op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack); - for (auto& mutable_input : mutable_inputs) { - math_op_(mutable_input); - _set_bit(mutable_input, true); + TORCH_INTERNAL_ASSERT(mutable_inputs_with_their_clones.size() <= 1); + + for (std::pair mut_tensors: mutable_inputs_with_their_clones) { + auto& mutable_input = mut_tensors.first; + auto& cloned_mutable_input = mut_tensors.second; + auto& ivalue = (*stack)[stack_start]; + auto returned_output = std::move(ivalue).toTensor(); + + // sanity check to ensure that the tensor in stack aliases the cloned_mutable_input + TORCH_INTERNAL_ASSERT(cloned_mutable_input.is_same(returned_output)); + + // necessary for out= arg + at::native::resize_output(mutable_input, returned_output.sizes()); + + mutable_input.copy_(returned_output); + (*stack)[stack_start] = std::move(mutable_input); } } @@ -134,5 +148,5 @@ struct MathOpFallback { DispatchKey key; string op_name; }; - -} // namespace at +} +}// namespace at diff --git a/aten/src/ATen/native/NegateFallback.cpp b/aten/src/ATen/native/NegateFallback.cpp index 8920a2a4d78721..62e7eccea7113e 100644 --- a/aten/src/ATen/native/NegateFallback.cpp +++ b/aten/src/ATen/native/NegateFallback.cpp @@ -2,21 +2,12 @@ #include namespace at { - +namespace native { struct NegFallback : MathOpFallback { NegFallback() : MathOpFallback(DispatchKey::Negative, "negation") {} bool is_bit_set(const Tensor& tensor) override { return tensor.is_neg(); } - void _set_bit(const Tensor& tensor, bool value) override { - return tensor._set_neg(value); - } - Tensor resolve_bit(const Tensor& tensor) override { - return at::resolve_neg(tensor); - } - Tensor& math_op_(Tensor& tensor) override { - return tensor.neg_(); - } }; void negationFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { @@ -42,4 +33,5 @@ TORCH_LIBRARY_IMPL(aten, Negative, m) { TENSOR_UTILITIES_AND_CONSTRUCTORS(m) } +} } // namespace at diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index d11a5bb074c5d6..4b9a51f4b2fedf 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -60,12 +60,12 @@ namespace at { namespace native { template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { using traits = function_traits; - int remaining = N - block_work_size * blockIdx.x; + int remaining = N - block_work_size() * blockIdx.x; - if (remaining < block_work_size) { // if this block handles the reminder, just do a naive unrolled loop + if (remaining < block_work_size()) { // if this block handles the reminder, just do a naive unrolled loop auto input_calc = TrivialOffsetCalculator(); auto output_calc = TrivialOffsetCalculator<1>(); auto loader = memory::LoadWithoutCast(); @@ -80,11 +80,11 @@ __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { } template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void unrolled_elementwise_kernel(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s) { - int remaining = N - block_work_size * blockIdx.x; + int remaining 
= N - block_work_size() * blockIdx.x; auto policy = memory::policies::unroll(data, remaining, ic, oc, l, s); elementwise_kernel_helper(f, policy); } @@ -94,17 +94,17 @@ template static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t data) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); using traits = function_traits; - int64_t grid = (N + block_work_size - 1) / block_work_size; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); int vec_size = memory::can_vectorize_up_to(data); switch (vec_size) { case 4: - vectorized_elementwise_kernel<4, func_t, array_t><<>>(N, f, data); + vectorized_elementwise_kernel<4, func_t, array_t><<>>(N, f, data); C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 2: - vectorized_elementwise_kernel<2, func_t, array_t><<>>(N, f, data); + vectorized_elementwise_kernel<2, func_t, array_t><<>>(N, f, data); C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 1: { @@ -112,7 +112,7 @@ static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t auto output_calc = TrivialOffsetCalculator<1>(); auto loader = memory::LoadWithoutCast(); auto storer = memory::StoreWithoutCast(); - unrolled_elementwise_kernel<<>>(N, f, data, input_calc, output_calc, loader, storer); + unrolled_elementwise_kernel<<>>(N, f, data, input_calc, output_calc, loader, storer); C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } @@ -126,9 +126,9 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - int64_t grid = (N + block_work_size - 1) / block_work_size; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); - unrolled_elementwise_kernel<<>>(N, f, data, ic, oc, l, s); + unrolled_elementwise_kernel<<>>(N, f, data, ic, oc, l, s); C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/cuda/CrossKernel.cu b/aten/src/ATen/native/cuda/CrossKernel.cu index 06bea0aea972a9..3d06ce3fdf31a2 100644 --- a/aten/src/ATen/native/cuda/CrossKernel.cu +++ b/aten/src/ATen/native/cuda/CrossKernel.cu @@ -36,7 +36,7 @@ void launch_cross_kernel(const TensorIteratorBase& iter, int64_t ostride, const auto N = iter.numel(); auto offset_calculator = make_element_offset_calculator<3>(iter); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(N > 0 && N <= std::numeric_limits::max()); - int64_t grid = (N + NUM_THREADS - 1) / NUM_THREADS; + int64_t grid = (N + num_threads() - 1) / num_threads(); auto stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kHalf, iter.common_dtype(), "cross_cuda", [&] { @@ -45,11 +45,11 @@ void launch_cross_kernel(const TensorIteratorBase& iter, int64_t ostride, auto x2 = static_cast(iter.data_ptr(2)); constexpr int64_t int_max = std::numeric_limits::max(); if (ostride * 2 > int_max || x1stride * 2 > int_max || x2stride * 2 > int_max) { - cross_kernel<<>>( + cross_kernel<<>>( N, out, x1, x2, offset_calculator, ostride, x1stride, x2stride); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - cross_kernel<<>>( + cross_kernel<<>>( N, out, x1, x2, offset_calculator, static_cast(ostride), static_cast(x1stride), diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 481b5589684484..2f25b224e77fb0 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ 
-188,11 +188,11 @@ __global__ void distribution_binary_elementwise_kernel( using input_t_1 = typename function_traits::template arg<1>::type; using input_t_2 = typename function_traits::template arg<2>::type; - input_t_1 inputs_1[thread_work_size]; - input_t_2 inputs_2[thread_work_size]; + input_t_1 inputs_1[thread_work_size()]; + input_t_2 inputs_2[thread_work_size()]; - int base_index = BLOCK_WORK_SIZE * blockIdx.x; - int remaining = std::min(numel - base_index, BLOCK_WORK_SIZE); + int base_index = block_work_size() * blockIdx.x; + int remaining = std::min(numel - base_index, block_work_size()); curandStatePhilox4_32_10_t state; curand_init(std::get<0>(seeds), @@ -203,7 +203,7 @@ __global__ void distribution_binary_elementwise_kernel( // load data into registers int thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { break; } @@ -212,20 +212,20 @@ __global__ void distribution_binary_elementwise_kernel( inputs_1[i] = input_data_1[offsets[0]]; inputs_2[i] = input_data_2[offsets[1]]; - thread_idx += num_threads; + thread_idx += num_threads(); } // compute and store thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { break; } int input_idx = thread_idx + base_index; auto offsets = out_calc.get(input_idx); output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]); - thread_idx += num_threads; + thread_idx += num_threads(); } } @@ -254,16 +254,16 @@ void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_arg const input_t_1 *input_data_1 = static_cast(iter.data_ptr(1)); const input_t_2 *input_data_2 = static_cast(iter.data_ptr(2)); - int64_t grid = (numel + block_work_size - 1) / block_work_size; + int64_t grid = (numel + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); if (iter.is_contiguous()) { - distribution_binary_elementwise_kernel<<>>( + distribution_binary_elementwise_kernel<<>>( numel, f, philox_args, output_data, input_data_1, input_data_2, TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - distribution_binary_elementwise_kernel<<>>( + distribution_binary_elementwise_kernel<<>>( numel, f, philox_args, output_data, input_data_1, input_data_2, make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu index 180385aaf05202..57462ffe5da0dd 100644 --- a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu +++ b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu @@ -85,7 +85,7 @@ void _compute_linear_combination_internal_kernel( } }; - _lauch_kernel(iter.numel(), loop); + _lauch_kernel(iter.numel(), loop); } void _compute_linear_combination_cuda_kernel( diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index b4936c069b0b12..c6439e586f0b51 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -137,7 +137,7 @@ void _unpack_pivots_internal_kernel( } }; - _launch_kernel(iter.numel(), loop); + _launch_kernel(iter.numel(), loop); } void unpack_pivots_cuda_kernel( diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 
8849293e20210e..d3a7027a8fb02c 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -9,13 +9,9 @@ #include -#define NUM_THREADS (C10_WARP_SIZE * 2) -#define THREAD_WORK_SIZE 4 -#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * num_threads) - -constexpr int num_threads = NUM_THREADS; -constexpr int thread_work_size = THREAD_WORK_SIZE; -constexpr int block_work_size = BLOCK_WORK_SIZE; +constexpr int num_threads() { return C10_WARP_SIZE * 4; } +constexpr int thread_work_size() { return 4; } +constexpr int block_work_size() { return thread_work_size() * num_threads(); } #include @@ -55,15 +51,15 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { int idx = blockIdx.x; - return_t results[thread_work_size]; - args_t args[thread_work_size]; + return_t results[thread_work_size()]; + args_t args[thread_work_size()]; // load policy.load(args, idx); // compute #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (policy.check_inbounds(i)) { results[i] = c10::guts::apply(f, args[i]); } @@ -209,18 +205,18 @@ template struct is_tuple: std::false_type {}; template struct is_tuple>: std::true_type {}; template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) { - int remaining = N - block_work_size * blockIdx.x; + int remaining = N - block_work_size() * blockIdx.x; elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll(data, remaining, ic, oc)); } template static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - int64_t grid = (N + block_work_size - 1) / block_work_size; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); - unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); + unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/cuda/MemoryAccess.cuh b/aten/src/ATen/native/cuda/MemoryAccess.cuh index 9bc1dd954eebb2..b9d94d6ce71b63 100644 --- a/aten/src/ATen/native/cuda/MemoryAccess.cuh +++ b/aten/src/ATen/native/cuda/MemoryAccess.cuh @@ -59,7 +59,7 @@ struct vectorized_load_helper { using arg_t = std::tuple_element_t; // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we // need a +1 offset to get the input - auto ptr = reinterpret_cast(self.data[arg_index + 1]) + block_work_size * idx; + auto ptr = reinterpret_cast(self.data[arg_index + 1]) + block_work_size() * idx; auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get(args[thread_unroll_idx]); }; self.load_single_arg(args_accessor, ptr); } @@ -164,7 +164,7 @@ struct unroll { data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {} __device__ inline bool check_inbounds(int thread_work_elem) { - return ((threadIdx.x + thread_work_elem*num_threads) < remaining); + return ((threadIdx.x + thread_work_elem*num_threads()) < remaining); } template @@ -172,30 +172,30 @@ struct unroll { constexpr int arity = std::tuple_size::value; int thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < 
thread_work_size(); i++) { if (thread_idx >= remaining) { return; } - int linear_idx = thread_idx + block_work_size * idx; + int linear_idx = thread_idx + block_work_size() * idx; auto offset = input_offset_calculator.get(linear_idx); detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); - thread_idx += num_threads; + thread_idx += num_threads(); } } template __device__ inline void store(scalar_t *from, int idx) { int thread_idx = threadIdx.x; - scalar_t *to = reinterpret_cast(data[0]) + block_work_size * idx; + scalar_t *to = reinterpret_cast(data[0]) + block_work_size() * idx; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { return; } - int linear_idx = thread_idx + block_work_size * idx; + int linear_idx = thread_idx + block_work_size() * idx; int offset = output_offset_calculator.get(linear_idx)[0]; storer.store(from[i], data[0], offset); - thread_idx += num_threads; + thread_idx += num_threads(); } } }; @@ -208,8 +208,8 @@ struct unroll { template // vec_size: number of scalars, can be 1, 2, or 4. struct vectorized { - static_assert(thread_work_size % vec_size == 0, "The workload per thread must be a multiple of vec_size"); - static constexpr int loop_size = thread_work_size / vec_size; + static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size"); + static constexpr int loop_size = thread_work_size() / vec_size; data_t data; @@ -226,7 +226,7 @@ struct vectorized { int thread_idx = threadIdx.x; #pragma unroll for (int i = 0; i < loop_size; i++) { - int index = thread_idx + i * num_threads; + int index = thread_idx + i * num_threads(); vec_t v = from_[index]; #pragma unroll for (int j = 0; j < vec_size; j++) { @@ -244,12 +244,12 @@ struct vectorized { template __device__ inline void store(scalar_t *from, int idx) { using vec_t = aligned_vector; - scalar_t *to = reinterpret_cast(data[0]) + block_work_size * idx; + scalar_t *to = reinterpret_cast(data[0]) + block_work_size() * idx; vec_t *to_ = reinterpret_cast(to); int thread_idx = threadIdx.x; #pragma unroll for (int i = 0; i < loop_size; i++) { - int index = thread_idx + i * num_threads; + int index = thread_idx + i * num_threads(); vec_t v; for (int j = 0; j < vec_size; j++) { v.val[j] = from[vec_size * i + j]; @@ -274,7 +274,7 @@ struct multi_outputs_unroll { data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {} __device__ inline bool check_inbounds(int thread_work_elem) { - return ((threadIdx.x + thread_work_elem*num_threads) < remaining); + return ((threadIdx.x + thread_work_elem*num_threads()) < remaining); } template @@ -282,14 +282,14 @@ struct multi_outputs_unroll { constexpr int arity = std::tuple_size::value; int thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { return; } - int linear_idx = thread_idx + block_work_size * idx; + int linear_idx = thread_idx + block_work_size() * idx; auto offset = input_offset_calculator.get(linear_idx); detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); - thread_idx += num_threads; + thread_idx += num_threads(); } } @@ -298,14 +298,14 @@ struct multi_outputs_unroll { __device__ inline void store(return_t *from, int idx) { int thread_idx = threadIdx.x; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { + for (int i = 0; i < 
thread_work_size(); i++) { if (thread_idx >= this->remaining) { return; } - int linear_idx = thread_idx + block_work_size * idx; + int linear_idx = thread_idx + block_work_size() * idx; auto offsets = this->output_offset_calculator.get(linear_idx); memory::detail::static_unroll::with_args(this->data, offsets, from[i]); - thread_idx += num_threads; + thread_idx += num_threads(); } } }; diff --git a/aten/src/ATen/native/cuda/ROCmLoops.cuh b/aten/src/ATen/native/cuda/ROCmLoops.cuh index c339364b5a0211..2134c0f8c725fe 100644 --- a/aten/src/ATen/native/cuda/ROCmLoops.cuh +++ b/aten/src/ATen/native/cuda/ROCmLoops.cuh @@ -229,7 +229,7 @@ struct has_same_arg_types { } // namespace detail template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void elementwise_kernel(int N, func_t f, array_t data) { // Assumption: // 1. all arguments of `f` have the same type, which could be different from the return type of `f` @@ -246,7 +246,7 @@ __global__ void elementwise_kernel(int N, func_t f, array_t data) { constexpr int nargs = traits::arity == 0 ? 1 : traits::arity; int tid = threadIdx.x; - int idx = block_work_size * blockIdx.x + tid; + int idx = block_work_size() * blockIdx.x + tid; // compute base pointers return_t *result_base = reinterpret_cast(data[0]) + idx; @@ -257,31 +257,31 @@ __global__ void elementwise_kernel(int N, func_t f, array_t data) { } // fetch data - return_t results[thread_work_size]; - arg_t args[thread_work_size][nargs]; + return_t results[thread_work_size()]; + arg_t args[thread_work_size()][nargs]; #pragma unroll - for (int i = 0; i < thread_work_size; i++) { - if (idx + num_threads * i < N) { + for (int i = 0; i < thread_work_size(); i++) { + if (idx + num_threads() * i < N) { #pragma unroll for (int j = 0; j < arity; j++) { - args[i][j] = *(args_base[j] + i * num_threads); + args[i][j] = *(args_base[j] + i * num_threads()); } } } // compute #pragma unroll - for (int i = 0; i < thread_work_size; i++) { - if (idx + num_threads * i < N) { + for (int i = 0; i < thread_work_size(); i++) { + if (idx + num_threads() * i < N) { results[i] = detail::invoke_with_array(f, args[i]); } } // store data #pragma unroll - for (int i = 0; i < thread_work_size; i++) { - if (idx + num_threads * i < N) { - *(result_base + i * num_threads) = results[i]; + for (int i = 0; i < thread_work_size(); i++) { + if (idx + num_threads() * i < N) { + *(result_base + i * num_threads()) = results[i]; } } } @@ -293,9 +293,9 @@ static void launch_kernel(int64_t N, const func_t& f, array_t data) { if (N == 0) { return; } - int64_t grid = (N + block_work_size - 1) / block_work_size; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); - elementwise_kernel<<>>(N, f, data); + elementwise_kernel<<>>(N, f, data); C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 5f03cc450f2069..2bb45a59d84332 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -120,7 +120,7 @@ struct _cuda_scatter_gather_internal_kernel { }; - _launch_scatter_gather_kernel(iter.numel(), loop); + _launch_scatter_gather_kernel(iter.numel(), loop); } }; // struct _cuda_scatter_fill_internal_kernel @@ -284,7 +284,7 @@ struct _cuda_scatter_fill_internal_kernel { }; - _launch_scatter_gather_kernel(iter.numel(), loop); + _launch_scatter_gather_kernel(iter.numel(), loop); } }; // struct 
_cuda_scatter_fill_internal_kernel diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 1fd151c1402495..006dfd11b52295 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -14,11 +14,7 @@ namespace at { namespace native { -#ifdef __HIP_PLATFORM_HCC__ -constexpr int CAT_ARRAY_BATCH_SIZE = 1024; -#else constexpr int CAT_ARRAY_BATCH_SIZE = 128; -#endif constexpr int CAT_ARRAY_MAX_INPUT_DIMS = 4; namespace { @@ -85,45 +81,6 @@ struct TensorSizeStride { */ -// Use pinned memory and and pass the struct by pointer on ROCm -template -struct CatArrInputTensor { - T* input; - IndexType offset; - IndexType dimSize; - IndexType nElements; -}; - -template -C10_LAUNCH_BOUNDS_1(512) -__global__ void HIP_CatArrayBatchedCopy( - T* output, - CatArrInputTensor* inputs, - TensorSizeStride os, - const int concatDim, - IndexType dimStride) { - - IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; - IndexType nElements = inputs[blockIdx.y].nElements; - - if(tid >= nElements) return; - - T* data = inputs[blockIdx.y].input; - IndexType offset = inputs[blockIdx.y].offset; - IndexType dimSize = inputs[blockIdx.y].dimSize; - IndexType dataOffset = offset * dimStride; - - IndexType stride = gridDim.x * blockDim.x; - - while( tid < nElements){ - IndexType elementOffset = CatArrIndexToOffset::compute( - os.tensorSize, os.tensorStride, dimSize, concatDim, tid); - output[dataOffset + elementOffset] = data[tid]; - - tid += stride; - } -} - // pass meta data directly through kernel argument instead of pin memory // In contiguous case, we will not need stride_size, setting it as 1 as placeholder // to pass compile. @@ -173,127 +130,6 @@ __global__ void CatArrayBatchedCopy( } } -template -void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, - int nDims, c10::MemoryFormat memory_format) { - // First, let's set up our kernel parameters. We start with a raw pointer to - // the storage for the output Tensor. - scalar_t *data = out.data_ptr(); - - // Kernel Parameter - long tensorMetadataSize = - sizeof(CatArrInputTensor) * CAT_ARRAY_BATCH_SIZE; - auto d_inputs_storage = at::empty( - {tensorMetadataSize}, out.options().dtype(at::kByte)); - auto d_inputs = static_cast *>( - d_inputs_storage.data_ptr()); - - TensorSizeStride outputParam; - - // Next, let's initialize the size, stride arrays for the output Tensor. 
- if (memory_format == c10::MemoryFormat::Contiguous) { - for (int i = 0; i < nDims; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i); - outputParam.tensorStride[i] = out.stride(i); - } - } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { - // permute the semantics of dims from NCHW to NHWC so that the input - // tensor is now contiguous - outputParam.tensorSize[0] = at::native::size(out, 0); - outputParam.tensorStride[0] = out.stride(0); - for (int i = 1; i < nDims - 1; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i + 1); - outputParam.tensorStride[i] = out.stride(i + 1); - } - outputParam.tensorSize[nDims - 1] = at::native::size(out, 1); - outputParam.tensorStride[nDims - 1] = out.stride(1); - } else { - TORCH_CHECK(false, "unsupported memory format"); - } - - at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); - - // Now we loop - int batchCounter = 0; - int64_t offset = 0; - for (int i = 0; i < inputs.size() ; i += CAT_ARRAY_BATCH_SIZE) { - // Re-allocate stackInputs every iteration to avoid read-after-write hazard - { - auto stackInputs_storage = at::empty({tensorMetadataSize}, - out.options().dtype(at::kByte).device(at::kCPU).pinned_memory(true)); - auto stackInputs = - static_cast *>( - stackInputs_storage.data_ptr()); - for (batchCounter = 0; - batchCounter < CAT_ARRAY_BATCH_SIZE && - (i+batchCounter) < inputs.size(); - ++batchCounter) { - int64_t dimSize = 0; - // There is a legacy case where a 1-D empty tensor can be concat with - // high-dimensional tensor - if (inputs[i+batchCounter].numel() > 0) { - dimSize = at::native::size(inputs[i+batchCounter], dimension); - } - - stackInputs[batchCounter].input = - inputs[i+batchCounter].data_ptr(); - stackInputs[batchCounter].offset = offset; - stackInputs[batchCounter].dimSize = dimSize; - stackInputs[batchCounter].nElements = inputs[i+batchCounter].numel(); - - // update offset - offset += dimSize; - } - at::native::copy_(d_inputs_storage, stackInputs_storage, - /* non_blocking= */ true); - } - - // Next, let's consider how we set our kernel launch parameters. - // We borrow from THCApply, which the kernel's internal indexing - // is based on. - dim3 applyBlock = dim3(32*16); - - //Get grid where x dim fills half gpu and y dim is number of tensors. - //This will have cating two tensors fill the entire grid, but prevent - //many threads from needlessly load meta data if their sizes is small. 
- dim3 catGrid; - getCatGrid(batchCounter, catGrid); - - if (memory_format != c10::MemoryFormat::Contiguous) { - switch (dimension) { - case 0: - break; - case 1: - dimension = nDims - dimension; - break; - default: - dimension--; - } - } - // Template Declarations for dim = 1, 2, 3, 4 -#define HANDLE_CASE(DIMS) \ - HIP_CatArrayBatchedCopy<<<\ - catGrid, applyBlock, 0, stream.stream()>>>(\ - data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); - switch (nDims) { - case 1: - HANDLE_CASE(1); - break; - case 2: - HANDLE_CASE(2); - break; - case 3: - HANDLE_CASE(3); - break; - case 4: - HANDLE_CASE(4); - break; - } -#undef HANDLE_CASE - } -} - template void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, int nDims, c10::MemoryFormat memory_format) { @@ -546,19 +382,6 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { }); allSameType = allSameType && (out.scalar_type() == firstType); -#ifdef __HIP_PLATFORM_HCC__ - if (inputs.size() > 1 && - out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && - at::cuda::detail::canUse32BitIndexMath(out) && - allContiguous && - all32BitIndexable && - allSameType) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - out.scalar_type(), "cat_cuda", [&]() { - hip_parallel_cat(out, inputs, dimension, nDims, memory_format); - }); -#else // We support the contiguous inputs and non-contiguous input (<=4 dims) in different ways // For contiguous input, we don't need to pass stride meta data to cuda kernel through constant // memory. Therefore, we could pass more inputs to cuda threads. @@ -587,7 +410,6 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { out.scalar_type(), "cat_cuda", [&]() { parallel_cat(out, inputs, dimension, nDims, memory_format); }); -#endif } else { int64_t offset = 0; for (int j = 0; j < inputs.size(); j++) diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu index 64bda79809bbbc..8b43900e92716c 100644 --- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu @@ -103,7 +103,7 @@ void _unfold_backward_internal_kernel( grad_out_data[grad_out_idx_dim * grad_out_dim_stride] = *grad_in_data; }; - _launch_unfold_backward_kernel(iter.numel(), loop); + _launch_unfold_backward_kernel(iter.numel(), loop); } else { auto offset_calc = make_offset_calculator<3>(iter); @@ -138,7 +138,7 @@ void _unfold_backward_internal_kernel( }; - _launch_unfold_backward_kernel(iter.numel(), loop); + _launch_unfold_backward_kernel(iter.numel(), loop); } } diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu index a3b04333a52807..f24f865440400a 100644 --- a/aten/src/ATen/test/cuda_vectorized_test.cu +++ b/aten/src/ATen/test/cuda_vectorized_test.cu @@ -8,11 +8,13 @@ using namespace at::native; using namespace at::native::memory; -__managed__ double4 buffer1[1024]; -__managed__ double4 buffer2[1024]; +constexpr int buffer_size = 1024; + +__managed__ double4 buffer1[buffer_size]; +__managed__ double4 buffer2[buffer_size]; void reset_buffers() { - for (int i = 0; i < 1024; i++) { + for (int i = 0; i < buffer_size; i++) { buffer1[i].x = i; buffer1[i].y = i + 0.1; buffer1[i].z = i + 0.2; @@ -74,6 +76,7 @@ TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) { // defined in `ATen/native/cuda/MemoryAccess.cuh` template __global__ void 
vectorized_copy(scalar_t *dst, scalar_t *src) { + static_assert(vec_size <= thread_work_size() && thread_work_size() % vec_size == 0, "Invalid vec_size"); using array_t = at::detail::Array; array_t data; data[0] = reinterpret_cast(dst); @@ -81,9 +84,15 @@ __global__ void vectorized_copy(scalar_t *dst, scalar_t *src) { int idx = blockIdx.x; using vectorized = policies::vectorized; auto policy = vectorized(data); - scalar_t buf[thread_work_size]; + scalar_t buf[thread_work_size()]; +#if defined(CUDA_VERSION) && CUDA_VERSION < 11000 + // This fails only on CUDA 10.x, remove this after CUDA 10.x support is dropped + scalar_t *buf_ = &buf[0]; + auto accessor = [&](int index) -> scalar_t & { return buf_[index]; }; +#else auto accessor = [&](int index) -> scalar_t & { return buf[index]; }; - policy.load_single_arg(accessor, src + 256 * blockIdx.x); +#endif + policy.load_single_arg(accessor, src + block_work_size() * blockIdx.x); policy.store(buf, idx); } @@ -98,7 +107,8 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { // vec4 copy reset_buffers(); cudaDeviceSynchronize(); - vectorized_copy<<<16, 64>>>(b2, b1); + constexpr int total_work_size = buffer_size * 4; + vectorized_copy<<>>(b2, b1); C10_CUDA_KERNEL_LAUNCH_CHECK(); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); @@ -112,7 +122,7 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { // vec2 copy reset_buffers(); cudaDeviceSynchronize(); - vectorized_copy<<<16, 64>>>(b2, b1); + vectorized_copy<<>>(b2, b1); C10_CUDA_KERNEL_LAUNCH_CHECK(); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); @@ -126,7 +136,7 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { // vec1 copy reset_buffers(); cudaDeviceSynchronize(); - vectorized_copy<<<16, 64>>>(b2, b1); + vectorized_copy<<>>(b2, b1); C10_CUDA_KERNEL_LAUNCH_CHECK(); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); @@ -136,8 +146,8 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { ASSERT_EQ(buffer1[i].z, buffer2[i].z); ASSERT_EQ(buffer1[i].w, buffer2[i].w); } -// Skipping this part until https://github.com/pytorch/pytorch/issues/51863 is resolved +// Skipping this part until https://github.com/pytorch/pytorch/issues/51863 is resolved #if 0 // unaligned for (int i = 0; i < 16; i++) { @@ -146,9 +156,7 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { b2 = reinterpret_cast(reinterpret_cast(buffer2) + j); cudaGetLastError(); cudaDeviceSynchronize(); - vectorized_copy<<<1, 64>>>(b2, b1); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - + vectorized_copy<<<1, num_threads()>>>(b2, b1); cudaDeviceSynchronize(); auto err = cudaGetLastError(); if (i % 16 == 0 && j % 16 == 0) { diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index db913c6fe23b42..bc257ef5430cf0 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -301,7 +301,7 @@ static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BF static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } /* Special case fp32 atomic. 
*/ -#if defined(__HIP_PLATFORM_HCC__) && defined(__gfx908__) +#if defined(__HIP_PLATFORM_HCC__) static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAddNoRet(address, val); } #else static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 26210cb56a1408..b0c6aef4e9be17 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -161,7 +161,7 @@ if(BUILD_CAFFE2 AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE)) # add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit add_subdirectory(transforms) endif() -if(NOT BUILD_CAFFE2) +if(NOT BUILD_CAFFE2 AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE)) add_subdirectory(proto) endif() @@ -1817,7 +1817,7 @@ if(BUILD_TEST) endif() # For special tests that explicitly uses dependencies, we add them here - if(USE_MPI) + if(BUILD_CAFFE2 AND USE_MPI) target_link_libraries(mpi_test ${MPI_CXX_LIBRARIES}) if(USE_CUDA) target_link_libraries(mpi_gpu_test ${MPI_CXX_LIBRARIES}) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index 85ec5b4ad00aa8..dd3e890689c344 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -131,7 +131,7 @@ else() endif() if(NOT DEFINED ENV{PYTORCH_ROCM_ARCH}) - set(PYTORCH_ROCM_ARCH gfx803;gfx900;gfx906;gfx908) + set(PYTORCH_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a) else() set(PYTORCH_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH}) endif() @@ -187,21 +187,40 @@ if(HIP_FOUND) set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) ### Remove setting of Flags when FindHIP.CMake PR #558 is accepted.### - set(hip_DIR ${HIP_PATH}/lib/cmake/hip) - set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) - set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) - set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) - set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) - set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) - set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) - set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) - set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) - set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) - set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) - set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) - set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) - set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) - set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) + # As of ROCm 5.1.x, all *.cmake files are under /opt/rocm/lib/cmake/ + if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.1.0") + set(hip_DIR ${ROCM_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCM_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${ROCM_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCM_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${ROCM_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCM_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${ROCM_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${ROCM_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${ROCM_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust) + else() + set(hip_DIR ${HIP_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + set(AMDDeviceLibs_DIR 
${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) + endif() find_package_and_print_version(hip REQUIRED) find_package_and_print_version(hsa-runtime64 REQUIRED) diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index 8beeb55b448404..f307aa251dd642 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -307,11 +307,11 @@ If the operator is an ATen operator (shows up in the TorchScript graph with the * Define the symbolic function in ``torch/onnx/symbolic_opset.py``, for example `torch/onnx/symbolic_opset9.py `_. - Make sure the function has the same name as the ATen function, which may declared in + Make sure the function has the same name as the ATen function, which may be declared in ``torch/_C/_VariableFunctions.pyi`` or ``torch/nn/functional.pyi`` (these files are generated at build time, so will not appear in your checkout until you build PyTorch). * The first arg is always the ONNX graph that is being built for export. - Other arg names must EXACTLY match the names in ``_VariableFunctions.pyi``, + Other arg names must EXACTLY match the names in the ``.pyi`` file, because dispatch is done with keyword arguments. * In the symbolic function, if the operator is in the `ONNX standard operator set `_, @@ -365,8 +365,8 @@ See the ``symbolic_opset*.py`` files for more examples. torch.autograd.Functions ^^^^^^^^^^^^^^^^^^^^^^^^ -If the operator is defined in a sub-class of :class:`torch.autograd.Function`, -there are two ways to export it. +If the operator is a sub-class of :class:`torch.autograd.Function`, there are two ways +to export it. Static Symbolic Method ~~~~~~~~~~~~~~~~~~~~~~ @@ -388,11 +388,11 @@ PythonOp Symbolic ~~~~~~~~~~~~~~~~~ Alternatively, you can register a custom symbolic function. -This gives the symoblic function access to more info through the +This gives the symbolic function access to more info through the TorchScript ``Node`` object for the original operation, which gets passed in as the second argument (after the ``Graph`` object). -All autograd ``Function``s are emitted in the TorchScript graph as ``prim::PythonOp`` nodes. +All autograd ``Function``\ s appear in the TorchScript graph as ``prim::PythonOp`` nodes. In order to differentiate between different ``Function`` subclasses, the symbolic function should use the ``name`` kwarg which gets set to the name of the class. @@ -400,11 +400,12 @@ symbolic function should use the ``name`` kwarg which gets set to the name of th the ``prim`` namespace, so for this use case, there's a back door: register the symbolic for ``"::prim_PythonOp"``. -Please also consider adding shape inference logic when you regiester a custom symbolic function -via setType API. This can help the exporter to obtain correct shape inference. -An example of setType is test_aten_embedding_2 in test_operators.py. -Although it is not required to add shape inference logic, -the exporter emits a warning message if it is not added. 
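[Editorial aside, not part of the patch: a minimal, hypothetical sketch of the "static symbolic method" route described in this section, including the optional ``setType`` hint that the replacement paragraph below recommends. The class and op choice here are assumptions for illustration only:

    import torch

    class MyRelu(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            return x.clamp(min=0)

        @staticmethod
        def symbolic(g, x):
            ret = g.op("Relu", x)
            # Optional: copy the input's type/shape onto the output to help
            # shape/type inference for downstream nodes.
            ret.setType(x.type())
            return ret

Tracing a module that calls ``MyRelu.apply(x)`` and exporting it then emits a plain ``Relu`` node.]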
+Custom symbolic functions should add type and shape information by calling ``setType(...)`` +on Value objects before returning them (implemented in C++ by +``torch::jit::Value::setType``). This is not required, but it can help the exporter's +shape and type inference for down-stream nodes. For a non-trivial example of ``setType``, see +``test_aten_embedding_2`` in +`test_operators.py `_. The example below shows how you can access ``requires_grad`` via the ``Node`` object:: @@ -430,13 +431,17 @@ The example below shows how you can access ``requires_grad`` via the ``Node`` ob print("arg {}: {}, requires grad: {}".format(i, arg, requires_grad)) name = kwargs["name"] + ret = None if name == "MyClip": - return g.op("Clip", args[0], min_f=args[1]) + ret = g.op("Clip", args[0], min_f=args[1]) elif name == "MyRelu": - return g.op("Relu", args[0]) + ret = g.op("Relu", args[0]) else: # Logs a warning and returns None return _unimplemented("prim::PythonOp", "unknown node kind: " + name) + # Copy type and shape from original node. + ret.setType(n.type()) + return ret from torch.onnx import register_custom_op_symbolic register_custom_op_symbolic("::prim_PythonOp", symbolic_pythonop, 1) diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 084dad349004a8..c828f0ecbf2a12 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -597,5 +597,4 @@ Utilities are_deterministic_algorithms_enabled set_warn_always is_warn_always_enabled - vmap _assert diff --git a/ios/TestApp/AppleWWDRCAG3.cer b/ios/TestApp/AppleWWDRCAG3.cer new file mode 100644 index 00000000000000..32f96f81dd6ea4 Binary files /dev/null and b/ios/TestApp/AppleWWDRCAG3.cer differ diff --git a/ios/TestApp/fastlane/Fastfile b/ios/TestApp/fastlane/Fastfile index 3e8ef00f0f7591..42a6346e1b7a47 100644 --- a/ios/TestApp/fastlane/Fastfile +++ b/ios/TestApp/fastlane/Fastfile @@ -4,7 +4,14 @@ platform :ios do before_all do setup_circle_ci end - lane :install_cert do + lane :install_root_cert do + import_certificate( + certificate_path: "AppleWWDRCAG3.cer", + keychain_path: "/Users/distiller/Library/Keychains/fastlane_tmp_keychain-db", + keychain_password: "" + ) + end + lane :install_dev_cert do puts "Installing Certificates.p12" import_certificate( keychain_name: ENV["MATCH_KEYCHAIN_NAME"], diff --git a/related_commits b/related_commits new file mode 100644 index 00000000000000..f2349f17655c30 --- /dev/null +++ b/related_commits @@ -0,0 +1,4 @@ +ubuntu|pytorch|apex|none| +centos|pytorch|apex|none| +ubuntu|pytorch|torchvision|release/0.11|fa347eb9f38c1759b73677a11b17335191e3f602 +centos|pytorch|torchvision|release/0.11|fa347eb9f38c1759b73677a11b17335191e3f602 diff --git a/test/cpp_extensions/cublas_extension.cpp b/test/cpp_extensions/cublas_extension.cpp new file mode 100644 index 00000000000000..61945b1aa223a3 --- /dev/null +++ b/test/cpp_extensions/cublas_extension.cpp @@ -0,0 +1,17 @@ +#include + +#include +#include + +#include + +torch::Tensor noop_cublas_function(torch::Tensor x) { + cublasHandle_t handle; + TORCH_CUDABLAS_CHECK(cublasCreate(&handle)); + TORCH_CUDABLAS_CHECK(cublasDestroy(handle)); + return x; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("noop_cublas_function", &noop_cublas_function, "a cublas function"); +} diff --git a/test/cpp_extensions/cusolver_extension.cpp b/test/cpp_extensions/cusolver_extension.cpp new file mode 100644 index 00000000000000..515d09958a8d27 --- /dev/null +++ b/test/cpp_extensions/cusolver_extension.cpp @@ -0,0 +1,17 @@ +#include +#include + +#include + + +torch::Tensor 
noop_cusolver_function(torch::Tensor x) { + cusolverDnHandle_t handle; + TORCH_CUSOLVER_CHECK(cusolverDnCreate(&handle)); + TORCH_CUSOLVER_CHECK(cusolverDnDestroy(handle)); + return x; +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("noop_cusolver_function", &noop_cusolver_function, "a cusolver function"); +} diff --git a/test/cpp_extensions/setup.py b/test/cpp_extensions/setup.py index 7888d0e3a88bbd..96301510f05adf 100644 --- a/test/cpp_extensions/setup.py +++ b/test/cpp_extensions/setup.py @@ -48,6 +48,19 @@ 'nvcc': ['-O2']}) ext_modules.append(extension) +if torch.cuda.is_available() and CUDA_HOME is not None: + cublas_extension = CUDAExtension( + name='torch_test_cpp_extension.cublas_extension', + sources=['cublas_extension.cpp'] + ) + ext_modules.append(cublas_extension) + + cusolver_extension = CUDAExtension( + name='torch_test_cpp_extension.cusolver_extension', + sources=['cusolver_extension.cpp'] + ) + ext_modules.append(cusolver_extension) + setup( name='torch_test_cpp_extension', packages=['torch_test_cpp_extension'], diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 811137a8d83b42..6e998c5c271d39 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -15,7 +15,7 @@ import time import unittest from itertools import product -from typing import Dict, List, Union, Callable +from typing import Callable, Dict, List, Union from unittest import mock from unittest.mock import patch @@ -24,25 +24,25 @@ from torch.distributed.elastic.multiprocessing import ProcessFailure, start_processes from torch.distributed.elastic.multiprocessing.api import ( MultiprocessContext, - SignalException, RunProcsResult, + SignalException, Std, _validate_full_rank, - to_map, _wrap, + to_map, ) from torch.distributed.elastic.multiprocessing.errors.error_handler import _write_error from torch.testing._internal.common_utils import ( + IS_IN_CI, + IS_MACOS, + IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN, - TEST_WITH_TSAN, TEST_WITH_DEV_DBG_ASAN, - IS_IN_CI, - IS_WINDOWS, - IS_MACOS, + TEST_WITH_TSAN, + run_tests, sandcastle_skip_if, ) -from torch.testing._internal.common_utils import run_tests class RunProcResultsTest(unittest.TestCase): @@ -224,6 +224,7 @@ def start_processes_zombie_test( # tests incompatible with tsan or asan if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS): + class StartProcessesTest(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp(prefix=f"{self.__class__.__name__}_") @@ -251,12 +252,15 @@ def assert_pids_noexist(self, pids: Dict[int, int]): def test_to_map(self): local_world_size = 2 - self.assertEqual({0: Std.OUT, 1: Std.OUT}, to_map(Std.OUT, local_world_size)) + self.assertEqual( + {0: Std.OUT, 1: Std.OUT}, to_map(Std.OUT, local_world_size) + ) self.assertEqual( {0: Std.NONE, 1: Std.OUT}, to_map({1: Std.OUT}, local_world_size) ) self.assertEqual( - {0: Std.ERR, 1: Std.OUT}, to_map({0: Std.ERR, 1: Std.OUT}, local_world_size) + {0: Std.ERR, 1: Std.OUT}, + to_map({0: Std.ERR, 1: Std.OUT}, local_world_size), ) def test_invalid_log_dir(self): @@ -382,9 +386,7 @@ def test_void_function(self): results = pc.wait(period=0.1) self.assertEqual({0: None, 1: None}, results.return_values) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "tests incompatible with asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "tests incompatible with asan") def test_function_large_ret_val(self): # python 
multiprocessing.queue module uses pipes and actually PipedQueues # This means that if a single object is greater than a pipe size @@ -439,7 +441,9 @@ def test_function_raise(self): self.assertEqual(1, failure.exitcode) self.assertEqual("", failure.signal_name()) self.assertEqual(pc.pids()[0], failure.pid) - self.assertEqual(os.path.join(log_dir, "0", "error.json"), error_file) + self.assertEqual( + os.path.join(log_dir, "0", "error.json"), error_file + ) self.assertEqual( int(error_file_data["message"]["extraInfo"]["timestamp"]), int(failure.timestamp), @@ -541,17 +545,22 @@ def test_multiprocessing_context_poll_raises_exception(self): run_result = mp_context._poll() self.assertEqual(1, len(run_result.failures)) failure = run_result.failures[0] - self.assertEqual("Signal 1 (SIGHUP) received by PID 123", failure.message) + self.assertEqual( + "Signal 1 (SIGHUP) received by PID 123", failure.message + ) # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS): + class StartProcessesListTest(StartProcessesTest): ######################################## # start_processes as binary tests ######################################## def test_function(self): - for start_method, redirs in product(self._start_methods, redirects_oss_test()): + for start_method, redirs in product( + self._start_methods, redirects_oss_test() + ): with self.subTest(start_method=start_method, redirs=redirs): pc = start_processes( name="echo", @@ -644,6 +653,7 @@ def test_binary_redirect_and_tee(self): # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_IN_CI): + class StartProcessesNotCITest(StartProcessesTest): def test_wrap_bad(self): none = "" @@ -796,7 +806,8 @@ def test_function_exit(self): self.assertEqual(pc.pids()[0], failure.pid) self.assertEqual("", error_file) self.assertEqual( - f"Process failed with exitcode {FAIL}", failure.message + "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html", + failure.message, ) self.assertLessEqual(failure.timestamp, int(time.time())) diff --git a/test/distributed/elastic/multiprocessing/errors/api_test.py b/test/distributed/elastic/multiprocessing/errors/api_test.py index 859069004ae712..a24986c659811f 100644 --- a/test/distributed/elastic/multiprocessing/errors/api_test.py +++ b/test/distributed/elastic/multiprocessing/errors/api_test.py @@ -115,7 +115,10 @@ def test_process_failure_no_error_file(self): pf = self.failure_without_error_file(exitcode=138) self.assertEqual("", pf.signal_name()) self.assertEqual("", pf.error_file) - self.assertEqual("Process failed with exitcode 138", pf.message) + self.assertEqual( + "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html", + pf.message, + ) def test_child_failed_error(self): pf0 = self.failure_with_error_file(exception=SentinelError("rank 0")) @@ -134,7 +137,7 @@ def test_child_failed_error(self): rank: 0 (local_rank: 0) exitcode: 1 (pid: 997) error_file: /tmp/ApiTesttbb37ier/error.json - msg: "SentinelError: rank 0" + traceback: "SentinelError: rank 0" ============================================= Other Failures: [1]: @@ -148,7 +151,7 @@ def test_child_failed_error(self): rank: 2 (local_rank: 0) exitcode: 138 (pid: 997) error_file: - msg: "Process failed with exitcode 138" + traceback: To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
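[Editorial aside, not part of the patch: the new failure message above points users at the elastic errors documentation instead of repeating the exit code. The documented pattern it refers to is wrapping the worker entrypoint with the ``record`` decorator so the traceback is written to the error file these tests inspect; a minimal sketch:

    from torch.distributed.elastic.multiprocessing.errors import record

    @record
    def main():
        # Any exception raised here is recorded to the per-rank error file,
        # which is what produces the "traceback:" field shown above.
        raise RuntimeError("worker failed")

    if __name__ == "__main__":
        main()
]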
********************************************* """ print(ex) diff --git a/test/jit/test_export_modes.py b/test/jit/test_export_modes.py index 70f236f59871f8..80c8e28ad00f78 100644 --- a/test/jit/test_export_modes.py +++ b/test/jit/test_export_modes.py @@ -64,7 +64,7 @@ def foo(a): return (a, a) f = io.BytesIO() x = torch.ones(3) - torch.onnx._export(foo, (x,), f, example_outputs=(x, x)) + torch.onnx._export(foo, (x,), f) @skipIfNoLapack def test_aten_fallback(self): @@ -76,9 +76,8 @@ def forward(self, x, y): x = torch.rand(3, 4) y = torch.rand(3, 4) - f = io.BytesIO() torch.onnx.export_to_pretty_string( - ModelWithAtenNotONNXOp(), (x, y), f, + ModelWithAtenNotONNXOp(), (x, y), None, add_node_names=False, do_constant_folding=False, operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK) @@ -91,11 +90,10 @@ class ModelWithAtenFmod(nn.Module): def forward(self, x, y): return torch.fmod(x, y) - f = io.BytesIO() x = torch.randn(3, 4, dtype=torch.float32) y = torch.randn(3, 4, dtype=torch.float32) torch.onnx.export_to_pretty_string( - ModelWithAtenFmod(), (x, y), f, + ModelWithAtenFmod(), (x, y), None, add_node_names=False, do_constant_folding=False, operator_export_type=OperatorExportTypes.ONNX_ATEN) diff --git a/test/jit/test_onnx_export.py b/test/jit/test_onnx_export.py index 21ee19d262aab1..b999a1ee0b8a39 100644 --- a/test/jit/test_onnx_export.py +++ b/test/jit/test_onnx_export.py @@ -49,20 +49,17 @@ def forward(self, x): tm = TraceMe() tm = torch.jit.trace(tm, torch.rand(3, 4)) - example_outputs = (tm(torch.rand(3, 4)),) f = io.BytesIO() - torch.onnx._export(tm, (torch.rand(3, 4),), f, example_outputs=example_outputs) + torch.onnx._export(tm, (torch.rand(3, 4),), f) def test_export_tensoroption_to(self): def foo(x): return x[0].clone().detach().cpu() + x traced = torch.jit.trace(foo, (torch.rand([2]))) - example_outputs = traced(torch.rand([2])) f = io.BytesIO() - torch.onnx._export_to_pretty_string(traced, (torch.rand([2]),), f, - example_outputs=example_outputs) + torch.onnx._export_to_pretty_string(traced, (torch.rand([2]),), f) def test_onnx_export_script_module(self): class ModuleToExport(torch.jit.ScriptModule): @@ -75,10 +72,8 @@ def forward(self, x): return x + x mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) @suppress_warnings def test_onnx_export_func_with_warnings(self): @@ -93,11 +88,9 @@ def __init__(self): def forward(self, x): return func_with_warning(x) - outputs = WarningTest()(torch.randn(42)) # no exception torch.onnx.export_to_pretty_string( - WarningTest(), torch.randn(42), None, verbose=False, - example_outputs=outputs) + WarningTest(), torch.randn(42), None, verbose=False) def test_onnx_export_script_python_fail(self): class PythonModule(torch.jit.ScriptModule): @@ -119,11 +112,9 @@ def forward(self, x): return y + y mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) f = io.BytesIO() with self.assertRaisesRegex(RuntimeError, "Couldn't export Python"): - torch.onnx._export(mte, (torch.zeros(1, 2, 3),), f, verbose=False, - example_outputs=outputs) + torch.onnx._export(mte, (torch.zeros(1, 2, 3),), f, verbose=False) def test_onnx_export_script_inline_trace(self): class ModuleToInline(torch.nn.Module): @@ -144,10 +135,8 @@ def forward(self, x): return y + y mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 
3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) def test_onnx_export_script_inline_script(self): class ModuleToInline(torch.jit.ScriptModule): @@ -169,10 +158,8 @@ def forward(self, x): return y + y mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) def test_onnx_export_script_module_loop(self): class ModuleToExport(torch.jit.ScriptModule): @@ -189,10 +176,8 @@ def forward(self, x): return x mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) @suppress_warnings def test_onnx_export_script_truediv(self): @@ -206,11 +191,9 @@ def forward(self, x): return x + z mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3, dtype=torch.float),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3, dtype=torch.float),), None, verbose=False) def test_onnx_export_script_non_alpha_add_sub(self): class ModuleToExport(torch.jit.ScriptModule): @@ -223,10 +206,8 @@ def forward(self, x): return bs - 1 mte = ModuleToExport() - outputs = torch.LongTensor([mte(torch.rand(3, 4))]) torch.onnx.export_to_pretty_string( - mte, (torch.rand(3, 4),), None, verbose=False, - example_outputs=outputs) + mte, (torch.rand(3, 4),), None, verbose=False) def test_onnx_export_script_module_if(self): class ModuleToExport(torch.jit.ScriptModule): @@ -240,10 +221,8 @@ def forward(self, x): return x mte = ModuleToExport() - outputs = mte(torch.zeros(1, 2, 3, dtype=torch.long)) torch.onnx.export_to_pretty_string( - mte, (torch.zeros(1, 2, 3),), None, verbose=False, - example_outputs=outputs) + mte, (torch.zeros(1, 2, 3),), None, verbose=False) def test_onnx_export_script_inline_params(self): class ModuleToInline(torch.jit.ScriptModule): @@ -272,8 +251,7 @@ def forward(self, x): reference = torch.mm(torch.mm(torch.zeros(2, 3), torch.ones(3, 3)), torch.ones(3, 4)) self.assertEqual(result, reference) torch.onnx.export_to_pretty_string( - mte, (torch.ones(2, 3),), None, verbose=False, - example_outputs=result) + mte, (torch.ones(2, 3),), None, verbose=False) def test_onnx_export_speculate(self): @@ -305,18 +283,16 @@ def transpose(x): return x.t() f1 = Foo(transpose) - outputs_f1 = f1(torch.ones(1, 10, dtype=torch.float)) f2 = Foo(linear) - outputs_f2 = f2(torch.ones(1, 10, dtype=torch.float)) torch.onnx.export_to_pretty_string( f1, (torch.ones(1, 10, dtype=torch.float), ), - None, verbose=False, example_outputs=outputs_f1) + None, verbose=False) torch.onnx.export_to_pretty_string( f2, (torch.ones(1, 10, dtype=torch.float), ), - None, verbose=False, example_outputs=outputs_f2) + None, verbose=False) def test_onnx_export_shape_reshape(self): class Foo(torch.nn.Module): @@ -328,10 +304,8 @@ def forward(self, x): return reshaped foo = torch.jit.trace(Foo(), torch.zeros(1, 2, 3)) - outputs = foo(torch.zeros(1, 2, 3)) f = io.BytesIO() - torch.onnx.export_to_pretty_string(foo, (torch.zeros(1, 2, 3)), f, - example_outputs=outputs) + torch.onnx.export_to_pretty_string(foo, (torch.zeros(1, 2, 3)), f) def test_listconstruct_erasure(self): class FooMod(torch.nn.Module): @@ -360,11 +334,10 @@ def forward(self, x): mod = DynamicSliceExportMod() input 
= torch.rand(3, 4, 5) - example_outs = mod(input) f = io.BytesIO() torch.onnx.export_to_pretty_string( - DynamicSliceExportMod(), (input,), f, example_outputs=example_outs, opset_version=10) + DynamicSliceExportMod(), (input,), f, opset_version=10) def test_export_dict(self): class DictModule(torch.nn.Module): @@ -380,4 +353,4 @@ def forward(self, x_in: torch.Tensor) -> typing.Dict[str, torch.Tensor]: with self.assertRaisesRegex(RuntimeError, r"DictConstruct.+is not supported."): torch.onnx.export_to_pretty_string( - torch.jit.script(mod), (x_in,), f, example_outputs=(mod(x_in),)) + torch.jit.script(mod), (x_in,), f) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 1d95dc8d0d8a4b..ee2402cc32b728 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -1115,9 +1115,8 @@ class Mod(torch.nn.Module): def forward(self, x, w): return torch.matmul(x, w).detach() - f = io.BytesIO() torch.onnx.export_to_pretty_string( - Mod(), (torch.rand(3, 4), torch.rand(4, 5)), f) + Mod(), (torch.rand(3, 4), torch.rand(4, 5)), None) def test_trace_slice_full_dim(self): def foo(x): diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index 59909db5958cc6..3bcd57018c36a8 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -19,7 +19,7 @@ def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None): outputs = model(inputs) script_model = torch.jit.script(model) - run_model_test(self, script_model, False, example_outputs=outputs, + run_model_test(self, script_model, False, input=inputs, rtol=rtol, atol=atol) diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index 4a12d92141bdca..53f0faec55d031 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -40,14 +40,13 @@ def check_onnx_opset_operator(model, ops, opset_version=_export_onnx_opset_versi assert attributes[j][attribute_field] == getattr(graph.node[i].attribute[j], attribute_field) -def check_onnx_opsets_operator(module, x, ops, opset_versions, training=torch.onnx.TrainingMode.EVAL, example_outputs=None, +def check_onnx_opsets_operator(module, x, ops, opset_versions, training=torch.onnx.TrainingMode.EVAL, input_names=None, dynamic_axes=None): for opset_version in opset_versions: f = io.BytesIO() torch.onnx.export(module, x, f, opset_version=opset_version, training=training, - example_outputs=example_outputs, input_names=input_names, dynamic_axes=dynamic_axes) model = onnx.load(io.BytesIO(f.getvalue())) @@ -91,10 +90,8 @@ def forward(self, input, k): x = torch.arange(1., 6., requires_grad=True) k = torch.tensor(3) module = MyModuleDynamic() - example_output = module(x, k) check_onnx_opsets_operator(module, [x, k], ops, - opset_versions=[10], - example_outputs=example_output) + opset_versions=[10]) def test_maxpool(self): module = torch.nn.MaxPool1d(2, stride=1) @@ -191,7 +188,6 @@ def forward(self, x): module = DynamicSliceModel() x = torch.rand(1, 2) - example_output = module(x) ops_10 = [{"op_name" : "Shape"}, {"op_name" : "Constant"}, {"op_name" : "Gather", @@ -202,7 +198,7 @@ def forward(self, x): {"op_name" : "Slice", "attributes" : []}] ops = {10 : ops_10} - check_onnx_opsets_operator(module, x, ops, opset_versions=[10], example_outputs=example_output, + check_onnx_opsets_operator(module, x, ops, opset_versions=[10], input_names=['x'], dynamic_axes={"x": [0, 1]}) ops_10 = [{"op_name" : "Constant"}, @@ -212,7 +208,7 @@ def forward(self, x): {"op_name" : "Slice", "attributes" : []}] 
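[Editorial aside, not part of the patch: the test updates in this region all reflect the same API change, namely that ``example_outputs`` has been removed from the export entry points, so scripted and traced modules are exported from their inputs alone. A minimal sketch assuming the updated API:

    import io
    import torch

    class Doubler(torch.nn.Module):
        def forward(self, x):
            return x + x

    f = io.BytesIO()
    # No example_outputs argument; the exporter runs the module itself.
    torch.onnx.export(torch.jit.script(Doubler()), (torch.zeros(1, 2, 3),), f,
                      opset_version=10)
]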
ops = {10 : ops_10} - check_onnx_opsets_operator(module, x, ops, opset_versions=[10], example_outputs=example_output) + check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) def test_flip(self): class MyModule(Module): diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 14d47945f64377..696e01b2c393a0 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -279,12 +279,11 @@ def test_conv_onnx_irv4_opset8(self): def test_conv_variable_length(self): x = torch.ones(5, 3, 6, 6, requires_grad=True) model = torch.nn.Conv2d(3, 2, 3) - y = model(x) dynamic_axes = {"input_1": [0, 2, 3], "output_1": {0: "output_1_variable_dim_0", 1: "output_1_variable_dim_1"}} model_proto_name = "conv2d.onnx" torch.onnx.export(model, x, model_proto_name, verbose=True, input_names=["input_1"], output_names=["output_1"], - example_outputs=y, dynamic_axes=dynamic_axes) + dynamic_axes=dynamic_axes) import onnx onnx_model = onnx.load(model_proto_name) @@ -729,22 +728,6 @@ def test_cumsum(self): x = torch.randn(2, 3, 4, requires_grad=True) self.assertONNX(lambda x: torch.cumsum(x, dim=1), x, opset_version=11) - def test_retain_param_name_disabled(self): - class MyModule(Module): - def __init__(self): - super(MyModule, self).__init__() - self.fc1 = nn.Linear(4, 5, bias=False) - self.fc1.weight.data.fill_(2.) - self.fc2 = nn.Linear(5, 6, bias=False) - self.fc2.weight.data.fill_(3.) - - def forward(self, x): - return self.fc2(self.fc1(x)) - - x = torch.randn(3, 4).float() - self.assertONNX(MyModule(), (x,), _retain_param_name=False, - keep_initializers_as_inputs=True) - def test_c2_op(self): class MyModel(torch.nn.Module): def __init__(self): diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index fad9a0ba5b66ef..c4074bb1bbcbfb 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -134,7 +134,7 @@ def convert_cuda(self, model, input): return cuda_model, cuda_input def run_debug_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, example_outputs=None, + input=None, use_gpu=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX): """ # TODO: remove this from the final release version @@ -153,7 +153,6 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, model, input = self.convert_cuda(model, input) onnxir, torch_out = do_export(model, input, export_params=self.embed_params, verbose=False, - example_outputs=example_outputs, do_constant_folding=False, opset_version=self.opset_version, keep_initializers_as_inputs=True, @@ -168,7 +167,7 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, def run_actual_test(self, model, train, batch_size, state_dict=None, input=None, use_gpu=True, rtol=0.001, atol=1e-7, - example_outputs=None, do_constant_folding=True, + do_constant_folding=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX, input_names=None, dynamic_axes=None, remained_onnx_input_idx=None): @@ -191,7 +190,6 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, # Verify the model runs the same in Caffe2 verify.verify(model, input, c2, rtol=rtol, atol=atol, - example_outputs=example_outputs, do_constant_folding=do_constant_folding, opset_version=self.opset_version, keep_initializers_as_inputs=True, @@ -202,7 +200,7 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, def run_model_test(self, model, train, batch_size, state_dict=None, input=None, use_gpu=True, rtol=0.001, 
atol=1e-7, - example_outputs=None, do_constant_folding=True, + do_constant_folding=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX, input_names=None, dynamic_axes=None, remained_onnx_input_idx=None): @@ -214,7 +212,6 @@ def run_model_test(self, model, train, batch_size, state_dict=None, if self.embed_params: self.run_actual_test(model, train, batch_size, state_dict, input, use_gpu=use_gpu_, rtol=rtol, atol=atol, - example_outputs=example_outputs, do_constant_folding=do_constant_folding, operator_export_type=operator_export_type, input_names=input_names, @@ -222,8 +219,7 @@ def run_model_test(self, model, train, batch_size, state_dict=None, remained_onnx_input_idx=remained_onnx_input_idx) else: self.run_debug_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_, example_outputs=example_outputs, - operator_export_type=operator_export_type) + use_gpu=use_gpu_, operator_export_type=operator_export_type) def test_linear(self): class MyModel(torch.nn.Module): @@ -289,7 +285,7 @@ def forward(self, input): # This test checks that given this edge condition, the model can be loaded and executed # in Caffe2 backend correctly. torch.onnx._export(model, input, f, verbose=True, export_type=ExportTypes.ZIP_ARCHIVE, - input_names=["input1", "fc1.bias"], _retain_param_name=False, + input_names=["input1", "fc1.bias"], keep_initializers_as_inputs=True) f.seek(0) @@ -1401,9 +1397,8 @@ def forward(self, x): return x[1:x.size(0)] module = DynamicSliceModel() x = torch.rand(1, 2) - example_output = module(x) self.run_model_test(DynamicSliceModel(), train=False, input=(x,), - batch_size=BATCH_SIZE, use_gpu=False, example_outputs=example_output) + batch_size=BATCH_SIZE, use_gpu=False) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_slice_to_the_end(self): @@ -1472,7 +1467,7 @@ def forward(self): y = torch.ones(2, 3, 4) * 2 self.run_model_test(Arithmetic(), train=False, input=(), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(x + 3, y * (x + 3))) + use_gpu=False) def test_tensor_factories(self): class TensorFactory(torch.nn.Module): @@ -1493,11 +1488,9 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(torch.ones(x.size()),), - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) + use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(torch.ones(x.size()),), - remained_onnx_input_idx=[]) + use_gpu=False, remained_onnx_input_idx=[]) def test_tensor_like_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @@ -1509,12 +1502,10 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(torch.ones(x.size()),), - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) + use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) remained_onnx_input_idx = None if self.opset_version < 9 else [] self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=(torch.ones(x.size()),), - remained_onnx_input_idx=remained_onnx_input_idx) + use_gpu=False, remained_onnx_input_idx=remained_onnx_input_idx) def test_full(self): class FullModel(torch.nn.Module): @@ -1532,8 +1523,7 @@ def forward(self, x): return torch.full((4, 5), x, dtype=torch.long) x = 
torch.tensor(12) - self.run_model_test(FullClass(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, example_outputs=FullClass()(x)) + self.run_model_test(FullClass(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) def test_clamp(self): class ClampModel(torch.nn.Module): @@ -2021,8 +2011,7 @@ def forward(self, a: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, t return a x = (torch.randn(3, 4), torch.randn(4, 3)) - self.run_model_test(TupleModel(), train=False, input=(x,), batch_size=BATCH_SIZE, - example_outputs=(x,)) + self.run_model_test(TupleModel(), train=False, input=(x,), batch_size=BATCH_SIZE) def test_nested_tuple_input_output(self): class NestedTupleModel(torch.jit.ScriptModule): @@ -2032,8 +2021,7 @@ def forward(self, a: torch.Tensor, b: Tuple[torch.Tensor, Tuple[torch.Tensor, to x = torch.randn(4, 5) y = (torch.randn(4, 5), (torch.randn(4, 5), torch.randn(4, 5))) - self.run_model_test(NestedTupleModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, - example_outputs=x + y[0] + y[1][0] + y[1][1]) + self.run_model_test(NestedTupleModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) def test_topk(self): class TopKModel(torch.nn.Module): @@ -2050,7 +2038,7 @@ def forward(self, input): return torch.topk(input, 3, dim=0) x = torch.randn(4, 3, requires_grad=True) - self.run_model_test(TopKModel(), train=False, input=(x,), batch_size=BATCH_SIZE, example_outputs=torch.topk(x, 3, dim=0)) + self.run_model_test(TopKModel(), train=False, input=(x,), batch_size=BATCH_SIZE) def test_floor(self): class FloorModel(torch.nn.Module): @@ -2086,9 +2074,7 @@ def forward(self, a): return torch.arange(a.size(0), dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - outputs = ArangeScript()(x) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) class ArangeModel(torch.nn.Module): def forward(self, a): @@ -2104,9 +2090,7 @@ def forward(self, a): return torch.arange(2, a.size(0) + 2, dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - outputs = ArangeScript()(x) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) class ArangeModel(torch.nn.Module): def forward(self, a): @@ -2122,9 +2106,7 @@ def forward(self, a): return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - outputs = ArangeScript()(x) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) class ArangeModel(torch.nn.Module): def forward(self, a): @@ -2249,9 +2231,7 @@ def forward(self, x): model = WhileModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) def test_while_cond(self): class WhileModel(torch.jit.ScriptModule): @@ -2266,9 +2246,7 @@ def forward(self, x, a): model = WhileModel() x = torch.zeros(1, 2, 3, dtype=torch.long) a = torch.tensor([0], dtype=torch.long) - outputs = model(x, a) - 
self.run_model_test(model, train=False, input=(x, a), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(x, a), batch_size=BATCH_SIZE) @unittest.skip("Disabled due to onnx optimizer deprecation") def test_loop(self): @@ -2281,9 +2259,7 @@ def forward(self, x): model = LoopModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) @unittest.skip("Disabled due to onnx optimizer deprecation") def test_dynamic_loop(self): @@ -2296,9 +2272,7 @@ def forward(self, x): model = LoopModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedMinOpsetVersion(9) @@ -2317,9 +2291,7 @@ def forward(self, x): model = NestedLoopsModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) def test_select(self): class SelectModel(torch.nn.Module): @@ -2337,9 +2309,7 @@ def forward(self, input): model = StandardDeviation() inputs = torch.randn(2, 3, 4) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) def test_std_along_dims(self): class StandardDeviationAlongDims(torch.nn.Module): @@ -2348,9 +2318,7 @@ def forward(self, input): model = StandardDeviationAlongDims() inputs = torch.randn(2, 3, 4) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) @skipIfUnsupportedMinOpsetVersion(9) def test_masked_fill(self): @@ -2379,9 +2347,7 @@ def forward(self, x, y, z): y = torch.zeros(4, requires_grad=True) z = torch.ones(5, requires_grad=True) model = MeshgridModel() - outputs = model(x, y, z) - self.run_model_test(model, train=False, input=(x, y, z), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(x, y, z), batch_size=BATCH_SIZE) def test_remainder(self): class RemainderModel(torch.nn.Module): @@ -2391,9 +2357,7 @@ def forward(self, input, other): x = torch.randn(4, 2, 3) y = torch.randn(1, 2, 1) model = RemainderModel() - outputs = model(x, y) - self.run_model_test(model, train=False, input=(x, y), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(x, y), batch_size=BATCH_SIZE) def test_remainder_scalar(self): class RemainderModel(torch.nn.Module): @@ -2402,9 +2366,7 @@ def forward(self, input): inputs = torch.randint(10, (2, 3)) model = RemainderModel() - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) def test_baddbmm(self): class 
MyModule(torch.nn.Module): @@ -2423,9 +2385,7 @@ def forward(self, x): model = GeluModel() inputs = torch.randn(2, 4, 5, 6, requires_grad=True) - outputs = model(inputs) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, - example_outputs=(outputs,)) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) @skipIfUnsupportedMinOpsetVersion(9) def test_index_fill(self): diff --git a/test/onnx/test_pytorch_onnx_caffe2_quantized.py b/test/onnx/test_pytorch_onnx_caffe2_quantized.py index 25b95563cde989..7a194a66d27ad8 100644 --- a/test/onnx/test_pytorch_onnx_caffe2_quantized.py +++ b/test/onnx/test_pytorch_onnx_caffe2_quantized.py @@ -28,7 +28,7 @@ def generic_test(self, model, sample_inputs, input_names=None, decimal=3, relaxe output = q_model(*pt_inputs) f = io.BytesIO() - torch.onnx.export(q_model, pt_inputs, f, input_names=input_names, example_outputs=output, + torch.onnx.export(q_model, pt_inputs, f, input_names=input_names, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) f.seek(0) onnx_model = onnx.load(f) @@ -84,8 +84,6 @@ def test_quantized_relu(self): self.generic_unary_test(torch.nn.ReLU()) def export_to_onnx(self, model, input, input_names): - outputs = model(input) - traced = torch.jit.trace(model, input) buf = io.BytesIO() torch.jit.save(traced, buf) @@ -93,7 +91,7 @@ def export_to_onnx(self, model, input, input_names): model = torch.jit.load(buf) f = io.BytesIO() - torch.onnx.export(model, input, f, input_names=input_names, example_outputs=outputs, + torch.onnx.export(model, input, f, input_names=input_names, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) f.seek(0) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 3763282147d8d2..35ff1b75e11ce3 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -45,9 +45,9 @@ def to_numpy(tensor): else: return tensor.cpu().numpy() -def convert_to_onnx(model, input=None, opset_version=9, example_outputs=None, - do_constant_folding=True, keep_initializers_as_inputs=True, - dynamic_axes=None, input_names=None, output_names=None, +def convert_to_onnx(model, input=None, opset_version=9, do_constant_folding=True, + keep_initializers_as_inputs=True, dynamic_axes=None, + input_names=None, output_names=None, fixed_batch_size=False, training=None, onnx_shape_inference=True): # export the model to ONNX @@ -55,7 +55,6 @@ def convert_to_onnx(model, input=None, opset_version=9, example_outputs=None, input_copy = copy.deepcopy(input) torch.onnx._export(model, input_copy, f, opset_version=opset_version, - example_outputs=example_outputs, do_constant_folding=do_constant_folding, keep_initializers_as_inputs=keep_initializers_as_inputs, dynamic_axes=dynamic_axes, @@ -132,7 +131,7 @@ def run_model_test(self, model, batch_size=2, state_dict=None, input = input + ({},) ort_sess = convert_to_onnx(model, input=input, opset_version=self.opset_version, - example_outputs=output, do_constant_folding=do_constant_folding, + do_constant_folding=do_constant_folding, keep_initializers_as_inputs=self.keep_initializers_as_inputs, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, fixed_batch_size=fixed_batch_size, training=training, @@ -284,9 +283,9 @@ def _run_test(m, remained_onnx_input_idx): _run_test(model, tracing_remained_onnx_input_idx) def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, - 
example_outputs=None, do_constant_folding=True, - dynamic_axes=None, input_names=None, output_names=None, - ort_optim_on=True, training=None): + do_constant_folding=True, dynamic_axes=None, + input_names=None, output_names=None, + ort_optim_on=True, training=None, use_external_data_format=None): import os import tempfile @@ -310,13 +309,12 @@ def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, input_copy = copy.deepcopy(input) torch.onnx.export(model, input_copy, model_file_name, opset_version=self.opset_version, - example_outputs=output, verbose=False, do_constant_folding=do_constant_folding, keep_initializers_as_inputs=self.keep_initializers_as_inputs, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, - use_external_data_format=True) + use_external_data_format=use_external_data_format) # compute onnxruntime output prediction ort_sess_opt = onnxruntime.SessionOptions() ort_sess_opt.graph_optimization_level = \ @@ -366,19 +364,55 @@ def forward(self, x): return x + torch.ones(2, 1024) x = torch.randn(2, 1) - self.run_model_test_with_external_data(LargeModel(), x) + self.run_model_test_with_external_data(LargeModel(), x, use_external_data_format=None) @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. - @unittest.skip("Enable this once large model with subgraph is supported in ORT") - def test_subgraph_with_external_data(self): + def test_largemodel_without_use_external_data_format_param(self): class LargeModel(torch.nn.Module): - def forward(self, x): - for i in range(x.size(0)): - x = x + torch.ones(2, 1024) - return x + def __init__(self): + super(LargeModel, self).__init__() + dim = 5 + n = 40 * 4 * 10 ** 6 + self.emb = torch.nn.Embedding(n, dim) + self.lin1 = torch.nn.Linear(dim, 1) + self.seq = torch.nn.Sequential( + self.emb, + self.lin1, + ) - x = torch.randn(2, 1) - self.run_model_test_with_external_data(torch.jit.script(LargeModel()), x) + def forward(self, input): + return self.seq(input) + + model = LargeModel() + x = torch.tensor([2], dtype=torch.long) + self.run_model_test_with_external_data(LargeModel(), x, use_external_data_format=None) + + @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. + def test_largemodel_with_use_external_data_format_False(self): + class LargeModel(torch.nn.Module): + def __init__(self): + super(LargeModel, self).__init__() + dim = 5 + n = 30 * 4 * 10 ** 6 + self.emb = torch.nn.Embedding(n, dim) + self.lin1 = torch.nn.Linear(dim, 1) + self.seq = torch.nn.Sequential( + self.emb, + self.lin1, + ) + + def forward(self, input): + return self.seq(input) + + model = LargeModel() + x = torch.tensor([3], dtype=torch.long) + + with self.assertRaises(RuntimeError) as cm: + self.run_model_test_with_external_data(LargeModel(), x, use_external_data_format=False) + + the_exception = cm.exception + self.assertEqual("RuntimeError: Exporting model exceed maximum protobuf size of 2GB. 
" + + "Please call torch.onnx.export without setting use_external_data_format parameter.") def test_fuse_conv_bn1d(self): class Fuse(torch.nn.Module): @@ -7784,7 +7818,6 @@ def forward(self, x): script_model = torch.jit.script(model) output = model(x) ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - example_outputs=output, training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) assert not torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) @@ -7828,7 +7861,6 @@ def forward(self, x): y = model(input) output = y.cpu().numpy() ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - example_outputs=y, training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) ort_mask = np.where(ort_outs[0] != 0, 1, 0) @@ -7913,11 +7945,9 @@ def forward(self, box_regression: torch.Tensor, proposals: List[torch.Tensor]): model = torch.jit.script(MyModule()) box_regression = torch.randn([4, 4]) proposal = [torch.randn(2, 4), torch.randn(2, 4)] - outputs = model(box_regression, proposal) with self.assertRaises(RuntimeError) as cm: - convert_to_onnx(model, input=(box_regression, proposal), - example_outputs=outputs) + convert_to_onnx(model, input=(box_regression, proposal)) def test_initializer_sequence(self): class MyModule(torch.nn.Module): @@ -7939,7 +7969,7 @@ def forward(self, x): x = torch.randn(32, 3) f = io.BytesIO() - torch.onnx._export(test_model, (x,), f, _retain_param_name=True, do_constant_folding=False) + torch.onnx._export(test_model, (x,), f, do_constant_folding=False) loaded_model = onnx.load_from_string(f.getvalue()) actual_list = [p.name for p in loaded_model.graph.initializer] @@ -7986,10 +8016,9 @@ def forward(self, x, y): x = torch.ones(2, 3, dtype=torch.float) y = torch.tensor(5, dtype=torch.long) - example_output = (test_model(x, y),) f = io.BytesIO() - torch.onnx.export(test_model, (x, y), f, example_outputs=example_output, _retain_param_name=True, do_constant_folding=False) + torch.onnx.export(test_model, (x, y), f, do_constant_folding=False) loaded_model = onnx.load_from_string(f.getvalue()) actual_list = [p.name for p in loaded_model.graph.initializer] diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index b87fa06d648a44..191cd1c3cb30b5 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -32,7 +32,6 @@ def setUp(self): def _model_to_graph(self, model, input, do_constant_folding=True, - example_outputs=None, training=TrainingMode.EVAL, operator_export_type=OperatorExportTypes.ONNX, input_names=None, @@ -49,7 +48,6 @@ def _model_to_graph(self, model, input, _disable_torch_constant_prop=True, operator_export_type=operator_export_type, training=training, - example_outputs=example_outputs, input_names=input_names, dynamic_axes=dynamic_axes) _set_onnx_shape_inference(True) @@ -116,11 +114,11 @@ def forward(self, input_t, n): example_output = model(input_t, n) with self.assertRaises(RuntimeError): - torch.onnx.export(model, - (input_t, n), - "test.onnx", - opset_version=self.opset_version, - example_outputs=[example_output]) + torch.onnx._export(model, + (input_t, n), + "test.onnx", + opset_version=self.opset_version, + example_outputs=[example_output]) def test_constant_fold_transpose(self): class TransposeModule(torch.nn.Module): @@ -558,27 +556,27 @@ def forward(self, x): assert node.kind() != "onnx::Shape" assert len(list(graph.nodes())) == 1 - def test_strip_doc_string(self): + def test_verbose(self): 
class MyModule(torch.nn.Module): def forward(self, input): return torch.exp(input) x = torch.randn(3, 4) - def is_model_stripped(f, strip_doc_string=None): - if strip_doc_string is None: + def is_model_stripped(f, verbose=None): + if verbose is None: torch.onnx.export(MyModule(), x, f, opset_version=self.opset_version) else: - torch.onnx.export(MyModule(), x, f, strip_doc_string=strip_doc_string, + torch.onnx.export(MyModule(), x, f, verbose=verbose, opset_version=self.opset_version) model = onnx.load(io.BytesIO(f.getvalue())) model_strip = copy.copy(model) onnx.helper.strip_doc_string(model_strip) return model == model_strip - # test strip_doc_string=True (default) + # test verbose=False (default) self.assertTrue(is_model_stripped(io.BytesIO())) - # test strip_doc_string=False - self.assertFalse(is_model_stripped(io.BytesIO(), False)) + # test verbose=True + self.assertFalse(is_model_stripped(io.BytesIO(), True)) # NB: remove this test once DataParallel can be correctly handled def test_error_on_data_parallel(self): @@ -720,9 +718,8 @@ def forward(self, x): q_model = torch.quantization.convert(q_model, inplace=False) q_model.eval() - output = q_model(*pt_inputs) - graph, _, __ = self._model_to_graph(q_model, pt_inputs, example_outputs=output, + graph, _, __ = self._model_to_graph(q_model, pt_inputs, operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, input_names=['pt_inputs'], dynamic_axes={'pt_inputs': [0, 1, 2, 3]}) @@ -749,9 +746,8 @@ def forward(self, x): x = torch.tensor([2]) model = PrimModule() - output = model(x) model.eval() - graph, _, __ = self._model_to_graph(model, (x,), example_outputs=output, + graph, _, __ = self._model_to_graph(model, (x,), operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, input_names=['x'], dynamic_axes={'x': [0]}) iter = graph.nodes() @@ -814,10 +810,9 @@ def forward(self, x): model = torch.jit.script(MyModule()) x = torch.randn(10, 3, 128, 128) - example_outputs = model(x) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(model, (x,), do_constant_folding=True, example_outputs=example_outputs, + graph, _, __ = self._model_to_graph(model, (x,), do_constant_folding=True, operator_export_type=OperatorExportTypes.ONNX, training=torch.onnx.TrainingMode.TRAINING, input_names=['x'], dynamic_axes={'x': [0, 1, 2, 3]}) diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index 144c209bd50fd4..df0383b1f354bb 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ b/test/quantization/eager/test_quantize_eager_ptq.py @@ -1233,8 +1233,6 @@ def _test_lower_graph_impl(self, model, data): input_names = ["x"] def export_to_onnx(model, input, input_names): - outputs = model(input) - traced = torch.jit.trace(model, input) buf = io.BytesIO() torch.jit.save(traced, buf) @@ -1242,7 +1240,7 @@ def export_to_onnx(model, input, input_names): model = torch.jit.load(buf) f = io.BytesIO() - torch.onnx.export(model, input, f, input_names=input_names, example_outputs=outputs, + torch.onnx.export(model, input, f, input_names=input_names, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) onnx_model = export_to_onnx(model, data, input_names) diff --git a/test/test_cpp_extensions_aot.py b/test/test_cpp_extensions_aot.py index cf35e6b13265d9..979f23d0698709 100644 --- a/test/test_cpp_extensions_aot.py +++ b/test/test_cpp_extensions_aot.py @@ -80,6 +80,24 @@ def test_cuda_extension(self): # 2 * 
sigmoid(0) = 2 * 0.5 = 1 self.assertEqual(z, torch.ones_like(z)) + @common.skipIfRocm + @unittest.skipIf(not TEST_CUDA, "CUDA not found") + def test_cublas_extension(self): + from torch_test_cpp_extension import cublas_extension + + x = torch.zeros(100, device="cuda", dtype=torch.float32) + z = cublas_extension.noop_cublas_function(x) + self.assertEqual(z, x) + + @common.skipIfRocm + @unittest.skipIf(not TEST_CUDA, "CUDA not found") + def test_cusolver_extension(self): + from torch_test_cpp_extension import cusolver_extension + + x = torch.zeros(100, device="cuda", dtype=torch.float32) + z = cusolver_extension.noop_cusolver_function(x) + self.assertEqual(z, x) + @unittest.skipIf(IS_WINDOWS, "Not available on Windows") def test_no_python_abi_suffix_sets_the_correct_library_name(self): # For this test, run_test.py will call `python setup.py install` in the diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 07628f6559ceef..6802cc8890dd6c 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -1524,6 +1524,28 @@ def test_sampler_reproducibility(self): ): self.assertEqual(list(fn()), list(fn())) + for sampler in ( + RandomSampler(self.dataset, num_samples=5, replacement=True), + RandomSampler(self.dataset, replacement=False), + WeightedRandomSampler(weights, num_samples=5, replacement=True), + WeightedRandomSampler(weights, num_samples=5, replacement=False), + SubsetRandomSampler(range(10)), + ): + torch.manual_seed(0) + l1 = list(sampler) + list(sampler) + + torch.manual_seed(0) + l2 = list(sampler) + list(sampler) + self.assertEqual(l1, l2) + + its = (iter(sampler), iter(sampler)) + ls = ([], []) + for idx in range(len(sampler)): + for i in range(2): + if idx == 0: + torch.manual_seed(0) + ls[i].append(next(its[i])) + self.assertEqual(ls[0], ls[1]) def _test_sampler(self, **kwargs): indices = range(2, 12) # using a regular iterable diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 0da0c0a1dcebd3..c951b0c53e793a 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -1,3 +1,4 @@ +import copy import http.server import itertools import os @@ -414,13 +415,30 @@ def test_demux_mux_datapipe(self): # Test Case: Uneven DataPipes source_numbers = list(range(0, 10)) + [10, 12] - numbers_dp = IDP(source_numbers) + numbers_dp = dp.iter.IterableWrapper(source_numbers) n1, n2 = numbers_dp.demux(2, lambda x: x % 2) self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1)) self.assertEqual([1, 3, 5, 7, 9], list(n2)) n = n1.mux(n2) self.assertEqual(source_numbers, list(n)) + @suppress_warnings # Suppress warning for lambda fn + def test_map_with_col_file_handle_datapipe(self): + temp_dir = self.temp_dir.name + datapipe1 = dp.iter.FileLister(temp_dir, '') + datapipe2 = dp.iter.FileLoader(datapipe1) + + def _helper(datapipe): + dp1 = datapipe.map(lambda x: x.read(), input_col=1) + dp2 = datapipe.map(lambda x: (x[0], x[1].read())) + self.assertEqual(list(dp1), list(dp2)) + + # tuple + _helper(datapipe2) + # list + datapipe3 = datapipe2.map(lambda x: list(x)) + _helper(datapipe3) + class TestDataFramesPipes(TestCase): """ @@ -619,24 +637,12 @@ def __init__(self, input_dp): super().__init__() self.input_dp = input_dp + # Prevent in-place modification def __iter__(self): - for i in self.input_dp: - yield i - - -class IDP(IterDataPipe): - def __init__(self, input_dp): - super().__init__() - self.input_dp = input_dp - self.length = len(input_dp) - - def __iter__(self): - for i in self.input_dp: + input_dp = self.input_dp if isinstance(self.input_dp, IterDataPipe) 
else copy.deepcopy(self.input_dp) + for i in input_dp: yield i - def __len__(self): - return self.length - class MDP(MapDataPipe): def __init__(self, input_dp): @@ -669,19 +675,19 @@ class TestFunctionalIterDataPipe(TestCase): def _test_picklable(self): arr = range(10) picklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ - (dp.iter.Mapper, IDP(arr), (), {}), - (dp.iter.Mapper, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}), - (dp.iter.Collator, IDP(arr), (), {}), - (dp.iter.Collator, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}), - (dp.iter.Filter, IDP(arr), (_fake_filter_fn, (0, ), {'test': True}), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (_fake_fn, (0, ), {'test': True}), {}), + (dp.iter.Collator, dp.iter.IterableWrapper(arr), (), {}), + (dp.iter.Collator, dp.iter.IterableWrapper(arr), (_fake_fn, (0, ), {'test': True}), {}), + (dp.iter.Filter, dp.iter.IterableWrapper(arr), (_fake_filter_fn, (0, ), {'test': True}), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes: p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] unpicklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ - (dp.iter.Mapper, IDP(arr), (lambda x: x, ), {}), - (dp.iter.Collator, IDP(arr), (lambda x: x, ), {}), - (dp.iter.Filter, IDP(arr), (lambda x: x >= 5, ), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (lambda x: x, ), {}), + (dp.iter.Collator, dp.iter.IterableWrapper(arr), (lambda x: x, ), {}), + (dp.iter.Filter, dp.iter.IterableWrapper(arr), (lambda x: x >= 5, ), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in unpicklable_datapipes: with warnings.catch_warnings(record=True) as wa: @@ -692,8 +698,8 @@ def _test_picklable(self): p = pickle.dumps(datapipe) def test_concat_datapipe(self): - input_dp1 = IDP(range(10)) - input_dp2 = IDP(range(5)) + input_dp1 = dp.iter.IterableWrapper(range(10)) + input_dp2 = dp.iter.IterableWrapper(range(5)) with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"): dp.iter.Concater() @@ -718,7 +724,7 @@ def test_concat_datapipe(self): def test_fork_datapipe(self): - input_dp = IDP(range(10)) + input_dp = dp.iter.IterableWrapper(range(10)) with self.assertRaises(ValueError): input_dp.fork(num_instances=0) @@ -836,7 +842,7 @@ def test_fork_datapipe(self): self.assertEqual(len(input_dp), len(dp3)) def test_demux_datapipe(self): - input_dp = IDP(range(10)) + input_dp = dp.iter.IterableWrapper(range(10)) with self.assertRaises(ValueError): input_dp.demux(num_instances=0, classifier_fn=lambda x: 0) @@ -882,8 +888,8 @@ def test_demux_datapipe(self): self.assertEqual(list(range(0, 5)), output2) # Test Case: classifer returns a value outside of [0, num_instance - 1] - dp = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2) - it = iter(dp[0]) + dp0 = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2) + it = iter(dp0[0]) with self.assertRaises(ValueError): next(it) next(it) @@ -960,7 +966,7 @@ def test_demux_datapipe(self): @suppress_warnings # Suppress warning for lambda fn def test_map_datapipe(self): - input_dp = IDP(range(10)) + input_dp = dp.iter.IterableWrapper(range(10)) def fn(item, dtype=torch.float, *, sum=False): data = torch.tensor(item, dtype=dtype) @@ -1005,7 +1011,7 @@ def fn_nn(d0, d1): def _helper(ref_fn, fn, input_col=None, output_col=None): for constr in (list, tuple): - datapipe = IDP([constr((0, 1, 2)), constr((3, 4, 5)), 
constr((6, 7, 8))]) + datapipe = dp.iter.IterableWrapper([constr((0, 1, 2)), constr((3, 4, 5)), constr((6, 7, 8))]) res_dp = datapipe.map(fn, input_col, output_col) ref_dp = datapipe.map(ref_fn) self.assertEqual(list(res_dp), list(ref_dp)) @@ -1072,9 +1078,11 @@ def _dict_update(data, newdata, remove_idx=None): return _data def _helper(ref_fn, fn, input_col=None, output_col=None): - datapipe = IDP([{"x": 0, "y": 1, "z": 2}, - {"x": 3, "y": 4, "z": 5}, - {"x": 6, "y": 7, "z": 8}]) + datapipe = dp.iter.IterableWrapper( + [{"x": 0, "y": 1, "z": 2}, + {"x": 3, "y": 4, "z": 5}, + {"x": 6, "y": 7, "z": 8}] + ) res_dp = datapipe.map(fn, input_col, output_col) ref_dp = datapipe.map(ref_fn) self.assertEqual(list(res_dp), list(ref_dp)) @@ -1117,7 +1125,7 @@ def _helper(ref_fn, fn, input_col=None, output_col=None): # TODO(VitalyFedyunin): If dill installed this test fails def _test_map_datapipe_nested_level(self): - input_dp = IDP([list(range(10)) for _ in range(3)]) + input_dp = dp.iter.IterableWrapper([list(range(10)) for _ in range(3)]) def fn(item, *, dtype=torch.float): return torch.tensor(item, dtype=dtype) @@ -1153,7 +1161,7 @@ def fn(item, *, dtype=torch.float): def test_collate_datapipe(self): arrs = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - input_dp = IDP(arrs) + input_dp = dp.iter.IterableWrapper(arrs) def _collate_fn(batch): return torch.tensor(sum(batch), dtype=torch.float) @@ -1172,7 +1180,7 @@ def _collate_fn(batch): def test_batch_datapipe(self): arrs = list(range(10)) - input_dp = IDP(arrs) + input_dp = dp.iter.IterableWrapper(arrs) with self.assertRaises(AssertionError): input_dp.batch(batch_size=0) @@ -1200,7 +1208,7 @@ def test_batch_datapipe(self): def test_unbatch_datapipe(self): target_length = 6 - prebatch_dp = IDP(range(target_length)) + prebatch_dp = dp.iter.IterableWrapper(range(target_length)) input_dp = prebatch_dp.batch(3) unbatch_dp = input_dp.unbatch() @@ -1208,13 +1216,13 @@ def test_unbatch_datapipe(self): for i, res in zip(prebatch_dp, unbatch_dp): self.assertEqual(i, res) - input_dp = IDP([[0, 1, 2], [3, 4, 5]]) + input_dp = dp.iter.IterableWrapper([[0, 1, 2], [3, 4, 5]]) unbatch_dp = input_dp.unbatch() self.assertEqual(len(list(unbatch_dp)), target_length) for i, res in zip(prebatch_dp, unbatch_dp): self.assertEqual(i, res) - input_dp = IDP([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) + input_dp = dp.iter.IterableWrapper([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) unbatch_dp = input_dp.unbatch() expected_dp = [[0, 1], [2, 3], [4, 5], [6, 7]] @@ -1233,7 +1241,7 @@ def test_unbatch_datapipe(self): for i, res in zip(expected_dp2, unbatch_dp): self.assertEqual(i, res) - input_dp = IDP([[0, 1, 2], [3, 4, 5]]) + input_dp = dp.iter.IterableWrapper([[0, 1, 2], [3, 4, 5]]) with self.assertRaises(ValueError): unbatch_dp = input_dp.unbatch(unbatch_level=-2) for i in unbatch_dp: @@ -1245,7 +1253,7 @@ def test_unbatch_datapipe(self): print(i) def test_bucket_batch_datapipe(self): - input_dp = IDP(range(20)) + input_dp = dp.iter.IterableWrapper(range(20)) with self.assertRaises(AssertionError): dp.iter.BucketBatcher(input_dp, batch_size=0) @@ -1258,7 +1266,7 @@ def _helper(**kwargs): data_len = 100 arrs = list(range(data_len)) random.shuffle(arrs) - input_dp = IDP(arrs) + input_dp = dp.iter.IterableWrapper(arrs) bucket_dp = dp.iter.BucketBatcher(input_dp, **kwargs) self.assertEqual(len(bucket_dp), data_len // 3 if kwargs['drop_last'] else data_len // 3 + 1) @@ -1291,7 +1299,7 @@ def _sort_fn(data): def test_filter_datapipe(self): - input_ds = IDP(range(10)) + input_ds = 
dp.iter.IterableWrapper(range(10)) def _filter_fn(data, val, clip=False): if clip: @@ -1318,7 +1326,7 @@ def _non_bool_fn(data): def test_filter_datapipe_nested_list(self): - input_ds = IDP(range(10)).batch(5) + input_ds = dp.iter.IterableWrapper(range(10)).batch(5) def _filter_fn(data, val): return data >= val @@ -1340,7 +1348,7 @@ def _filter_fn(data, val): filter_dp = input_ds.filter(nesting_level=5, filter_fn=_filter_fn, fn_kwargs={'val': 5}) temp = list(filter_dp) - input_ds = IDP(range(10)).batch(3) + input_ds = dp.iter.IterableWrapper(range(10)).batch(3) filter_dp = input_ds.filter(lambda ls: len(ls) >= 3) expected_dp3: List[List[int]] = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] @@ -1348,21 +1356,21 @@ def _filter_fn(data, val): for data, exp in zip(filter_dp, expected_dp3): self.assertEqual(data, exp) - input_ds = IDP([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [1, 2, 3]]]) + input_ds = dp.iter.IterableWrapper([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [1, 2, 3]]]) filter_dp = input_ds.filter(lambda x: x > 3, nesting_level=-1) expected_dp4 = [[[4, 5]], [[6, 7, 8]]] self.assertEqual(len(list(filter_dp)), len(expected_dp4)) for data2, exp2 in zip(filter_dp, expected_dp4): self.assertEqual(data2, exp2) - input_ds = IDP([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [1, 2, 3]]]) + input_ds = dp.iter.IterableWrapper([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [1, 2, 3]]]) filter_dp = input_ds.filter(lambda x: x > 7, nesting_level=-1) expected_dp5 = [[[8]]] self.assertEqual(len(list(filter_dp)), len(expected_dp5)) for data2, exp2 in zip(filter_dp, expected_dp5): self.assertEqual(data2, exp2) - input_ds = IDP([[[0, 1], [3, 4]], [[6, 7, 8], [1, 2, 3]]]) + input_ds = dp.iter.IterableWrapper([[[0, 1], [3, 4]], [[6, 7, 8], [1, 2, 3]]]) filter_dp = input_ds.filter(lambda ls: len(ls) >= 3, nesting_level=1) expected_dp6 = [[[6, 7, 8], [1, 2, 3]]] self.assertEqual(len(list(filter_dp)), len(expected_dp6)) @@ -1370,7 +1378,7 @@ def _filter_fn(data, val): self.assertEqual(data2, exp2) def test_sampler_datapipe(self): - input_dp = IDP(range(10)) + input_dp = dp.iter.IterableWrapper(range(10)) # Default SequentialSampler sampled_dp = dp.iter.Sampler(input_dp) # type: ignore[var-annotated] self.assertEqual(len(sampled_dp), 10) @@ -1387,7 +1395,7 @@ def test_sampler_datapipe(self): def test_shuffle_datapipe(self): exp = list(range(20)) - input_ds = IDP(exp) + input_ds = dp.iter.IterableWrapper(exp) with self.assertRaises(AssertionError): shuffle_dp = input_ds.shuffle(buffer_size=0) @@ -1413,15 +1421,15 @@ def test_shuffle_datapipe(self): def test_zip_datapipe(self): with self.assertRaises(TypeError): - dp.iter.Zipper(IDP(range(10)), list(range(10))) # type: ignore[arg-type] + dp.iter.Zipper(dp.iter.IterableWrapper(range(10)), list(range(10))) # type: ignore[arg-type] - zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP_NoLen(range(5))) # type: ignore[var-annotated] + zipped_dp = dp.iter.Zipper(dp.iter.IterableWrapper(range(10)), IDP_NoLen(range(5))) # type: ignore[var-annotated] with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(zipped_dp) exp = list((i, i) for i in range(5)) self.assertEqual(list(zipped_dp), exp) - zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP(range(5))) + zipped_dp = dp.iter.Zipper(dp.iter.IterableWrapper(range(10)), dp.iter.IterableWrapper(range(5))) self.assertEqual(len(zipped_dp), 5) self.assertEqual(list(zipped_dp), exp) # Reset @@ -1506,32 +1514,32 @@ def fn(item, dtype=torch.float, *, sum=False): def test_mux_datapipe(self): # Test Case: Elements are yielded one at a time from each 
DataPipe, until they are all exhausted - input_dp1 = IDP(range(4)) - input_dp2 = IDP(range(4, 8)) - input_dp3 = IDP(range(8, 12)) + input_dp1 = dp.iter.IterableWrapper(range(4)) + input_dp2 = dp.iter.IterableWrapper(range(4, 8)) + input_dp3 = dp.iter.IterableWrapper(range(8, 12)) output_dp = input_dp1.mux(input_dp2, input_dp3) expected_output = [0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11] self.assertEqual(len(expected_output), len(output_dp)) self.assertEqual(expected_output, list(output_dp)) # Test Case: Uneven input Data Pipes - input_dp1 = IDP([1, 2, 3, 4]) - input_dp2 = IDP([10]) - input_dp3 = IDP([100, 200, 300]) + input_dp1 = dp.iter.IterableWrapper([1, 2, 3, 4]) + input_dp2 = dp.iter.IterableWrapper([10]) + input_dp3 = dp.iter.IterableWrapper([100, 200, 300]) output_dp = input_dp1.mux(input_dp2, input_dp3) expected_output = [1, 10, 100, 2, 200, 3, 300, 4] self.assertEqual(len(expected_output), len(output_dp)) self.assertEqual(expected_output, list(output_dp)) # Test Case: Empty Data Pipe - input_dp1 = IDP([0, 1, 2, 3]) - input_dp2 = IDP([]) + input_dp1 = dp.iter.IterableWrapper([0, 1, 2, 3]) + input_dp2 = dp.iter.IterableWrapper([]) output_dp = input_dp1.mux(input_dp2) self.assertEqual(len(input_dp1), len(output_dp)) self.assertEqual(list(input_dp1), list(output_dp)) # Test Case: raises TypeError when __len__ is called and an input doesn't have __len__ - input_dp1 = IDP(range(10)) + input_dp1 = dp.iter.IterableWrapper(range(10)) input_dp_no_len = IDP_NoLen(range(10)) output_dp = input_dp1.mux(input_dp_no_len) with self.assertRaises(TypeError): @@ -1665,8 +1673,8 @@ def __iter__(self) -> Iterator[Tuple[int, str]]: self.assertTrue(issubclass(DP1, IterDataPipe)) dp1 = DP1(10) self.assertTrue(DP1.type.issubtype(dp1.type) and dp1.type.issubtype(DP1.type)) - dp2 = DP1(5) - self.assertEqual(dp1.type, dp2.type) + dp1_ = DP1(5) + self.assertEqual(dp1.type, dp1_.type) with self.assertRaisesRegex(TypeError, r"is not a generic class"): class InvalidDP5(DP1[tuple]): # type: ignore[type-arg] @@ -1679,10 +1687,10 @@ def __iter__(self) -> Iterator[T_co]: yield d # type: ignore[misc] self.assertTrue(issubclass(DP2, IterDataPipe)) - dp1 = DP2() # type: ignore[assignment] - self.assertTrue(DP2.type.issubtype(dp1.type) and dp1.type.issubtype(DP2.type)) - dp2 = DP2() # type: ignore[assignment] - self.assertEqual(dp1.type, dp2.type) + dp2 = DP2() # type: ignore[var-annotated] + self.assertTrue(DP2.type.issubtype(dp2.type) and dp2.type.issubtype(DP2.type)) + dp2_ = DP2() # type: ignore[var-annotated] + self.assertEqual(dp2.type, dp2_.type) class DP3(IterDataPipe[Tuple[T_co, str]]): r""" DataPipe without fixed type with __init__ function""" @@ -1695,10 +1703,10 @@ def __iter__(self) -> Iterator[Tuple[T_co, str]]: yield d, str(d) self.assertTrue(issubclass(DP3, IterDataPipe)) - dp1 = DP3(range(10)) # type: ignore[assignment] - self.assertTrue(DP3.type.issubtype(dp1.type) and dp1.type.issubtype(DP3.type)) - dp2 = DP3(5) # type: ignore[assignment] - self.assertEqual(dp1.type, dp2.type) + dp3 = DP3(range(10)) # type: ignore[var-annotated] + self.assertTrue(DP3.type.issubtype(dp3.type) and dp3.type.issubtype(DP3.type)) + dp3_ = DP3(5) # type: ignore[var-annotated] + self.assertEqual(dp3.type, dp3_.type) class DP4(IterDataPipe[tuple]): r""" DataPipe without __iter__ annotation""" @@ -1707,8 +1715,8 @@ def __iter__(self): raise NotImplementedError self.assertTrue(issubclass(DP4, IterDataPipe)) - dp = DP4() - self.assertTrue(dp.type.param == tuple) + dp4 = DP4() + self.assertTrue(dp4.type.param == tuple) class 
DP5(IterDataPipe): r""" DataPipe without type annotation""" @@ -1717,9 +1725,9 @@ def __iter__(self) -> Iterator[str]: raise NotImplementedError self.assertTrue(issubclass(DP5, IterDataPipe)) - dp = DP5() # type: ignore[assignment] + dp5 = DP5() from torch.utils.data._typing import issubtype - self.assertTrue(issubtype(dp.type.param, Any) and issubtype(Any, dp.type.param)) + self.assertTrue(issubtype(dp5.type.param, Any) and issubtype(Any, dp5.type.param)) class DP6(IterDataPipe[int]): r""" DataPipe with plain Iterator""" @@ -1728,13 +1736,13 @@ def __iter__(self) -> Iterator: raise NotImplementedError self.assertTrue(issubclass(DP6, IterDataPipe)) - dp = DP6() # type: ignore[assignment] - self.assertTrue(dp.type.param == int) + dp6 = DP6() + self.assertTrue(dp6.type.param == int) class DP7(IterDataPipe[Awaitable[T_co]]): r""" DataPipe with abstract base class""" - self.assertTrue(issubclass(DP6, IterDataPipe)) + self.assertTrue(issubclass(DP7, IterDataPipe)) self.assertTrue(DP7.type.param == Awaitable[T_co]) class DP8(DP7[str]): @@ -1765,11 +1773,11 @@ def __iter__(self) -> Iterator[int]: # Non-DataPipe input with DataPipe hint datasource = [(1, '1'), (2, '2'), (3, '3')] with self.assertRaisesRegex(TypeError, r"Expected argument 'dp' as a IterDataPipe"): - dp = DP0(datasource) + dp0 = DP0(datasource) - dp = DP0(IDP(range(10))) + dp0 = DP0(dp.iter.IterableWrapper(range(10))) with self.assertRaisesRegex(TypeError, r"Expected type of argument 'dp' as a subtype"): - dp = DP1(dp) + dp1 = DP1(dp0) def test_runtime(self): class DP(IterDataPipe[Tuple[int, T_co]]): @@ -1784,26 +1792,26 @@ def __iter__(self) -> Iterator[Tuple[int, T_co]]: dss = ([(1, '1'), (2, '2')], [(1, 1), (2, '2')]) for ds in dss: - dp = DP(ds) # type: ignore[var-annotated] - self.assertEqual(list(dp), ds) + dp0 = DP(ds) # type: ignore[var-annotated] + self.assertEqual(list(dp0), ds) # Reset __iter__ - self.assertEqual(list(dp), ds) + self.assertEqual(list(dp0), ds) dss = ([(1, 1), ('2', 2)], # type: ignore[assignment, list-item] [[1, '1'], [2, '2']], # type: ignore[list-item] [1, '1', 2, '2']) for ds in dss: - dp = DP(ds) + dp0 = DP(ds) with self.assertRaisesRegex(RuntimeError, r"Expected an instance as subtype"): - list(dp) + list(dp0) with runtime_validation_disabled(): - self.assertEqual(list(dp), ds) + self.assertEqual(list(dp0), ds) with runtime_validation_disabled(): - self.assertEqual(list(dp), ds) + self.assertEqual(list(dp0), ds) with self.assertRaisesRegex(RuntimeError, r"Expected an instance as subtype"): - list(dp) + list(dp0) def test_reinforce(self): T = TypeVar('T', int, str) @@ -1819,26 +1827,26 @@ def __iter__(self) -> Iterator[T]: ds = list(range(10)) # Valid type reinforcement - dp = DP(ds).reinforce_type(int) - self.assertTrue(dp.type, int) - self.assertEqual(list(dp), ds) + dp0 = DP(ds).reinforce_type(int) + self.assertTrue(dp0.type, int) + self.assertEqual(list(dp0), ds) # Invalid type with self.assertRaisesRegex(TypeError, r"'expected_type' must be a type"): - dp = DP(ds).reinforce_type(1) + dp1 = DP(ds).reinforce_type(1) # Type is not subtype with self.assertRaisesRegex(TypeError, r"Expected 'expected_type' as subtype of"): - dp = DP(ds).reinforce_type(float) + dp2 = DP(ds).reinforce_type(float) # Invalid data at runtime - dp = DP(ds).reinforce_type(str) + dp3 = DP(ds).reinforce_type(str) with self.assertRaisesRegex(RuntimeError, r"Expected an instance as subtype"): - list(dp) + list(dp3) # Context Manager to disable the runtime validation with runtime_validation_disabled(): - self.assertEqual(list(d 
for d in dp), ds) + self.assertEqual(list(d for d in dp3), ds) class NumbersDataset(IterDataPipe): @@ -1900,7 +1908,7 @@ def test_simple_sharding(self): self.assertEqual(sorted(all_items), sorted(items)) def test_sharding_length(self): - numbers_dp = IDP(range(13)) + numbers_dp = dp.iter.IterableWrapper(range(13)) sharded_dp0 = numbers_dp.sharding_filter() torch.utils.data.sharding.apply_sharding(sharded_dp0, 3, 0) sharded_dp1 = numbers_dp.sharding_filter() @@ -1912,7 +1920,7 @@ def test_sharding_length(self): self.assertEqual(4, len(sharded_dp1)) self.assertEqual(4, len(sharded_dp2)) - numbers_dp = IDP(range(1)) + numbers_dp = dp.iter.IterableWrapper(range(1)) sharded_dp0 = numbers_dp.sharding_filter() torch.utils.data.sharding.apply_sharding(sharded_dp0, 2, 0) sharded_dp1 = numbers_dp.sharding_filter() @@ -1922,11 +1930,11 @@ def test_sharding_length(self): @skipIfNoDill def test_old_dataloader(self): - dp = self._get_pipeline() - expected = list(dp) + dp0 = self._get_pipeline() + expected = list(dp0) - dp = self._get_pipeline().sharding_filter() - dl = DataLoader(dp, batch_size=1, shuffle=False, num_workers=2, + dp0 = self._get_pipeline().sharding_filter() + dl = DataLoader(dp0, batch_size=1, shuffle=False, num_workers=2, worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) items = [] for i in dl: diff --git a/test/test_nn.py b/test/test_nn.py index 92357d9ce15394..b6dd466faadfde 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9704,12 +9704,6 @@ def test_cosine_similarity(self): self.assertEqual(input1.grad, torch.zeros_like(input1)) self.assertEqual(input2.grad, input1 * 1e8) - # Check error when inputs are not the same shape - input1 = torch.randn(2, 2, 1) - input2 = torch.randn(2, 1, 3) - with self.assertRaises(RuntimeError): - F.cosine_similarity(input1, input2) - # Check type promotion, issue #61454 input = torch.tensor(12.) 
out = F.cosine_similarity(input.to(torch.int8), input, dim=-1) diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index 38af5bb2c78246..20db5c7972b83d 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -449,5 +449,15 @@ def test_nested_enable_python_mode(self) -> None: with enable_python_mode(LoggingTensor): pass + def test_tolist_numpy_with_python_mode(self) -> None: + x = LoggingTensor(torch.tensor([2.0, 3.0])) + with self.assertRaisesRegex(RuntimeError, "is not supported for tensor subclasses."): + x.tolist() + with self.assertRaisesRegex(RuntimeError, "is not supported for tensor subclasses."): + x.numpy() + with self.assertRaises(AssertionError): + self.assertEqual(x, None) + + if __name__ == '__main__': run_tests() diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index f632e95d9c7046..d66a1e5e452161 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -14,7 +14,7 @@ skipCPUIfNoFFT, deviceCountAtLeast, onlyCUDA, OpDTypes, skipIf) from torch.testing._internal.common_methods_invocations import spectral_funcs, SpectralFuncInfo -from setuptools import distutils +from distutils.version import LooseVersion from typing import Optional, List @@ -217,7 +217,7 @@ def get_op_name(op): @ops([op for op in spectral_funcs if not op.ndimensional]) def test_reference_1d(self, device, dtype, op): norm_modes = ((None, "forward", "backward", "ortho") - if distutils.version.LooseVersion(np.__version__) >= '1.20.0' + if LooseVersion(np.__version__) >= '1.20.0' else (None, "ortho")) test_args = [ *product( @@ -370,7 +370,7 @@ def test_fft_half_and_bfloat16_errors(self, device, dtype, op): @ops([op for op in spectral_funcs if op.ndimensional]) def test_reference_nd(self, device, dtype, op): norm_modes = ((None, "forward", "backward", "ortho") - if distutils.version.LooseVersion(np.__version__) >= '1.20.0' + if LooseVersion(np.__version__) >= '1.20.0' else (None, "ortho")) # input_ndim, s, dim @@ -469,7 +469,7 @@ def test_fftn_invalid(self, device, dtype, op): @dtypes(torch.double, torch.complex128) def test_fft2_numpy(self, device, dtype): norm_modes = ((None, "forward", "backward", "ortho") - if distutils.version.LooseVersion(np.__version__) >= '1.20.0' + if LooseVersion(np.__version__) >= '1.20.0' else (None, "ortho")) # input_ndim, s diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index e72b156c715db7..9530fbff8890eb 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -3258,6 +3258,10 @@ def helper(self, device, dtype, ptype, t_transform, std_transform): self.assertEqual(t_transform(r[:, :50]).std(), std_transform(4), atol=0.3, rtol=0) self.assertEqual(t_transform(r[:, 50:]).std(), std_transform(1), atol=0.2, rtol=0) + # test empty mean/std + out = torch.normal(mean=torch.empty((0, 2)), std=torch.empty((0, 1))) + self.assertEqual(out.size(), torch.Size([0, 2])) + r.fill_(42) r = torch.normal(2, 3, (100, 100), dtype=dtype, device=device) self.assertEqual(r.dtype, dtype) diff --git a/test/test_torch.py b/test/test_torch.py index 3c47fd87122dc7..f800a888493581 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5976,8 +5976,6 @@ def test_masked_scatter_bool_tensor(self, device): dst = dst.masked_scatter(mask, src) self.assertEqual(dst, torch.tensor([True, True, True], device=device)) - # refer https://github.com/pytorch/pytorch/issues/60190 - @skipIfRocm @onlyCUDA @largeTensorTest('30GB') def test_masked_scatter_large_tensor(self, device): @@ 
-8399,6 +8397,13 @@ def generate_inputs(num_batches): finally: torch.set_num_threads(num_threads) + def test_conj_neg_tolist(self): + x = torch.randn(2, dtype=torch.cfloat) + y1 = x.conj() + y1_expect = x.conj_physical() + y2 = y1.imag + self.assertEqual(y1, y1_expect.tolist()) + self.assertEqual(y2, y1_expect.imag.tolist()) # TODO: these empy classes are temporarily instantiated for XLA compatibility # once XLA updates their test suite it should be removed diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 6deda4032c7c58..9fee75d2e3719e 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -365,6 +365,16 @@ def test_conj_imag_view(self, device, dtype) -> None: self.assertEqual(v_imag, t_numpy_conj.imag) self.assertTrue(v_imag.is_neg()) + @onlyOnCPUAndCUDA + def test_conj_view_with_shared_memory(self, device) -> None: + a = _make_tensor((4, 5,), torch.cfloat, device) + b = a.conj() + c = a.conj() + + self.assertEqual(torch.add(a, b), a.add_(b)) + self.assertEqual(torch.add(b, c), torch.add(b, c, out=a)) + self.assertEqual(torch.add(b, c), b.add_(c)) + @onlyOnCPUAndCUDA @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) @suppress_warnings diff --git a/test/test_vmap.py b/test/test_vmap.py index 35b28db84f61f0..6f6996b09bb550 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -1,7 +1,8 @@ from torch.testing._internal.common_utils import TestCase, run_tests import torch import torch.nn.functional as F -from torch import Tensor, vmap +from torch import Tensor +from torch._vmap_internals import vmap import functools import itertools import warnings diff --git a/third_party/kineto b/third_party/kineto index dd32f3accff289..57188f45650e63 160000 --- a/third_party/kineto +++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit dd32f3accff289bc017d84f0ec1af01c36aaa6f8 +Subproject commit 57188f45650e63721d59f5a6005294e79d7d9f3a diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 70f7e7a83e1ec2..15a5a70872342c 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -89,6 +89,8 @@ "tools/autograd/templates/python_variable_methods.cpp", ] +includes = [os.path.join(proj_dir, include) for include in includes] + for new_dir in args.extra_include_dir: abs_new_dir = os.path.join(proj_dir, new_dir) if os.path.exists(abs_new_dir): @@ -112,6 +114,8 @@ "torch/include/*", ] +ignores = [os.path.join(proj_dir, ignore) for ignore in ignores] + # Check if the compiler is hip-clang. def is_hip_clang() -> bool: try: diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index dd8f817aba2c25..a29430520d0ff7 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -8,7 +8,7 @@ from subprocess import check_call, check_output, CalledProcessError import sys import sysconfig -from setuptools import distutils # type: ignore[import] +from distutils.version import LooseVersion from typing import IO, Any, Dict, List, Optional, Union, cast from . 
import which @@ -120,10 +120,10 @@ def _get_cmake_command() -> str: return cmake_command cmake3 = which('cmake3') cmake = which('cmake') - if cmake3 is not None and CMake._get_version(cmake3) >= distutils.version.LooseVersion("3.10.0"): + if cmake3 is not None and CMake._get_version(cmake3) >= LooseVersion("3.10.0"): cmake_command = 'cmake3' return cmake_command - elif cmake is not None and CMake._get_version(cmake) >= distutils.version.LooseVersion("3.10.0"): + elif cmake is not None and CMake._get_version(cmake) >= LooseVersion("3.10.0"): return cmake_command else: raise RuntimeError('no cmake or cmake3 with version >= 3.10.0 found') @@ -134,7 +134,7 @@ def _get_version(cmd: str) -> Any: for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): if 'version' in line: - return distutils.version.LooseVersion(line.strip().split(' ')[2]) + return LooseVersion(line.strip().split(' ')[2]) raise RuntimeError('no version found') def run(self, args: List[str], env: Dict[str, str]) -> None: @@ -387,7 +387,7 @@ def build(self, my_env: Dict[str, str]) -> None: # then. Until then, we use "--" to pass parameters to the # underlying build system. build_args += ['--'] - if IS_WINDOWS: + if IS_WINDOWS and not USE_NINJA: # We are likely using msbuild here build_args += ['/p:CL_MPCount={}'.format(max_jobs)] else: diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py index 8c1f7973442f37..7c214919549afc 100755 --- a/tools/stats/print_test_stats.py +++ b/tools/stats/print_test_stats.py @@ -108,7 +108,7 @@ def plural(n: int) -> str: def get_base_commit(sha1: str) -> str: return subprocess.check_output( - ["git", "merge-base", sha1, "origin/master"], + ["git", "merge-base", sha1, "origin/release/1.10"], encoding="ascii", ).strip() diff --git a/tools/test/test_cmake.py b/tools/test/test_cmake.py index a9fae6dc90e026..ecbce07f52d23f 100644 --- a/tools/test/test_cmake.py +++ b/tools/test/test_cmake.py @@ -22,7 +22,11 @@ def test_build_jobs(self, mock_cpu_count: unittest.mock.MagicMock) -> None: # MAX_JOBS, USE_NINJA, IS_WINDOWS, want (( '8', True, False), ['-j', '8']), # noqa: E201,E241 (( None, True, False), None), # noqa: E201,E241 + (( '7', False, False), ['-j', '7']), # noqa: E201,E241 + (( None, False, False), ['-j', '13']), # noqa: E201,E241 + (( '6', True, True), ['-j', '6']), # noqa: E201,E241 (( None, True, True), None), # noqa: E201,E241 + (( '11', False, True), ['/p:CL_MPCount=11']), # noqa: E201,E241 (( None, False, True), ['/p:CL_MPCount=13']), # noqa: E201,E241 ] for (max_jobs, use_ninja, is_windows), want in cases: diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 743a5d442197ae..38baee4799b548 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -9,6 +9,13 @@ if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) endif() +if(BUILD_BINARY) + add_library(aot_compiler SHARED + ${TORCH_SRC_DIR}/csrc/jit/mobile/nnc/aot_compiler.cpp + ) + install(TARGETS aot_compiler DESTINATION lib) +endif() + if(NOT BUILD_PYTHON) return() endif() @@ -430,9 +437,3 @@ if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") # Pybind11 requires explicit linking of the torch_python library target_link_libraries(nnapi_backend torch torch_python) endif() - -if(BUILD_BINARY) - add_library(aot_compiler SHARED - ${TORCH_SRC_DIR}/csrc/jit/mobile/nnc/aot_compiler.cpp - ) -endif() diff --git a/torch/__init__.py b/torch/__init__.py index 18c9b08a8b5368..abd8c1f918af8e 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -756,8 +756,6 @@ def 
compiled_with_cxx11_abi(): # torch.jit.script as a decorator, for instance): from ._lobpcg import lobpcg as lobpcg -from ._vmap_internals import vmap as vmap - # These were previously defined in native_functions.yaml and appeared on the # `torch` namespace, but we moved them to c10 dispatch to facilitate custom # class usage. We add these lines here to preserve backward compatibility. diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 2a1de8e82a774e..7f0451b089dad1 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -145,6 +145,9 @@ struct TORCH_API Node : std::enable_shared_from_this { // probably operate with names. at::NoNamesGuard no_names_guard; + // Keep track of backward pass for rocblas. + at::BackwardPassGuard in_backward; + bool pre_sampled = false; if (at::shouldRunRecordFunction(&pre_sampled)) { // Using RecordFunction to trigger observers in the backward pass diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 5760c485958a9a..17372137f927fe 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -362,19 +362,21 @@ void ConvertGraphToONNXProto( SymbolDimMap& symbol_map, int opset_version) { RawDataExportMap export_map; - std::tie(model_proto, export_map, symbol_map) = export_onnx( - graph, - {}, - opset_version, - {}, - false, - onnx_torch::OperatorExportTypes::ONNX, - true, - true, - {}, - true, - false, - std::string()); + bool val_use_external_data_format; + std::tie(model_proto, export_map, symbol_map, val_use_external_data_format) = + export_onnx( + graph, + {}, + opset_version, + {}, + false, + onnx_torch::OperatorExportTypes::ONNX, + true, + true, + {}, + true, + false, + std::string()); for (int i = 0; i < model_proto->graph().output_size(); ++i) { model_proto->mutable_graph()->mutable_output(i)->clear_type(); } diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index 2c8246daec92b1..bcab4e676d8361 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -263,19 +263,25 @@ void initPythonIRBindings(PyObject* module_) { std::shared_ptr<::ONNX_NAMESPACE::ModelProto> model_proto; RawDataExportMap export_map; SymbolDimMap symbol_map; - std::tie(model_proto, export_map, symbol_map) = export_onnx( - g, - initializers, - onnx_opset_version, - dynamic_axes, - defer_weight_export, - operator_export_type, - strip_doc_string, - keep_initializers_as_inputs, - custom_opsets, - add_node_names, - use_external_data_format, - onnx_file_path); + bool val_use_external_data_format = false; + std::tie( + model_proto, + export_map, + symbol_map, + val_use_external_data_format) = + export_onnx( + g, + initializers, + onnx_opset_version, + dynamic_axes, + defer_weight_export, + operator_export_type, + strip_doc_string, + keep_initializers_as_inputs, + custom_opsets, + add_node_names, + use_external_data_format, + onnx_file_path); std::unordered_map python_serialized_export_map; for (auto& kv : export_map) { @@ -289,7 +295,9 @@ void initPythonIRBindings(PyObject* module_) { } graph = serialize_model_proto_to_string(model_proto); return std::make_tuple( - py::bytes(graph), python_serialized_export_map); + py::bytes(graph), + python_serialized_export_map, + val_use_external_data_format); }, py::arg("initializers"), py::arg("onnx_opset_version") = 0, diff --git a/torch/csrc/jit/serialization/export.cpp 
b/torch/csrc/jit/serialization/export.cpp index 87792092061cfe..6f5aa2f044ab06 100644 --- a/torch/csrc/jit/serialization/export.cpp +++ b/torch/csrc/jit/serialization/export.cpp @@ -231,6 +231,12 @@ class EncoderBase { bool use_external_data_format = false, const std::string& onnx_file_path = std::string()); + unsigned long long int GetGraphProtoSize( + onnx::GraphProto* graph_proto, + const std::shared_ptr& graph, + const std::map& initializers = + std::map()); + virtual void EncodeTensor( onnx::TensorProto* tensor_proto, const at::Tensor& tensor, @@ -581,7 +587,6 @@ void EncoderBase::AddInitializersIntoGraphProto( bool use_external_data_format, const std::string& onnx_file_path) { AT_ASSERT(block->inputs().size() >= initializers.size()); - for (auto input : block->inputs()) { auto name_tensor_pair = initializers.find(input->debugName()); if (name_tensor_pair == initializers.end()) { @@ -598,6 +603,38 @@ void EncoderBase::AddInitializersIntoGraphProto( } } +unsigned long long int EncoderBase::GetGraphProtoSize( + onnx::GraphProto* graph_proto, + const std::shared_ptr& graph, + const std::map& initializers) { + unsigned long long int sizes = 0; + for (auto input : graph->inputs()) { + auto name_tensor_pair = initializers.find(input->debugName()); + if (name_tensor_pair == initializers.end()) { + continue; + } + onnx::GraphProto* graph_proto_copy = new onnx::GraphProto(*graph_proto); + auto tensor_proto = graph_proto_copy->add_initializer(); + const at::Tensor tensor = name_tensor_pair->second; + for (auto d : tensor.sizes()) { + tensor_proto->add_dims(d); + } + tensor_proto->set_data_type(ATenTypeToOnnxType(tensor.scalar_type())); + at::Tensor t; + if (tensor.is_quantized()) { + t = tensor.contiguous(); + } else { + t = tensor.contiguous().cpu(); + } + tensor_proto->set_raw_data(std::string( + static_cast(t.data_ptr()), t.element_size() * t.numel())); + sizes += tensor_proto->ByteSizeLong(); + delete graph_proto_copy; + graph_proto_copy = nullptr; + } + return sizes; +} + void EncoderBase::AddAttribute( onnx::NodeProto* node_proto, const jit::Node* node, @@ -726,6 +763,10 @@ class GraphEncoder : public EncoderBase { return raw_data_export_map_; } + bool get_use_external_data_format() { + return use_external_data_format_; + } + private: void EncodeTensor( onnx::TensorProto* tensor_proto, @@ -736,6 +777,7 @@ class GraphEncoder : public EncoderBase { RawDataExportMap raw_data_export_map_; bool defer_weight_export_; + bool use_external_data_format_; }; GraphEncoder::GraphEncoder( @@ -754,8 +796,21 @@ GraphEncoder::GraphEncoder( bool use_external_data_format, const std::string& onnx_file_path) : EncoderBase(operator_export_type, strip_doc), - defer_weight_export_(defer_weight_export) { + defer_weight_export_(defer_weight_export), + use_external_data_format_(use_external_data_format) { validateGraph(graph, operator_export_type); + // If graph proto size exceed maximum protobuf size of 2GB, set + // use_external_data_format to true. + if (!use_external_data_format && !onnx_file_path.empty() && + GetGraphProtoSize(model_proto_.mutable_graph(), graph, initializers) > + INT_MAX) { + GRAPH_DEBUG( + "Exporting model exceed maximum protobuf size of 2GB. Storing model parameters in external data files"); + use_external_data_format = true; + // use_external_data_format_ is one of graph_encoder private variable set + // for return `use_external_data_format` value. 
+ use_external_data_format_ = use_external_data_format; + } if (use_external_data_format) { TORCH_CHECK( @@ -895,7 +950,8 @@ std::string pretty_print_onnx( std::tuple< std::shared_ptr<::ONNX_NAMESPACE::ModelProto>, RawDataExportMap, - SymbolDimMap> + SymbolDimMap, + bool> export_onnx( const std::shared_ptr& graph, const std::map& initializers, @@ -929,7 +985,8 @@ export_onnx( std::make_shared<::ONNX_NAMESPACE::ModelProto>( graph_encoder.get_model_proto()), graph_encoder.get_raw_data_export_map(), - graph_encoder.get_symbol_dim_param_map()); + graph_encoder.get_symbol_dim_param_map(), + graph_encoder.get_use_external_data_format()); } std::string serialize_model_proto_to_string( @@ -938,7 +995,7 @@ std::string serialize_model_proto_to_string( TORCH_CHECK( proto_size <= INT_MAX, "Exporting model exceed maximum protobuf size of 2GB. " - "Please call torch.onnx.export with use_external_data_format=True."); + "Please call torch.onnx.export without setting use_external_data_format parameter."); return model_proto->SerializeAsString(); } diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h index 80f0884224392e..37d3433fa5f908 100644 --- a/torch/csrc/jit/serialization/export.h +++ b/torch/csrc/jit/serialization/export.h @@ -33,7 +33,8 @@ using SymbolDimMap = std::map; TORCH_API std::tuple< std::shared_ptr<::ONNX_NAMESPACE::ModelProto>, RawDataExportMap, - SymbolDimMap> + SymbolDimMap, + bool> export_onnx( const std::shared_ptr& graph, const std::map& initializers, diff --git a/torch/csrc/utils/tensor_list.cpp b/torch/csrc/utils/tensor_list.cpp index 7948734f1e58a2..1cde3a196d6bbb 100644 --- a/torch/csrc/utils/tensor_list.cpp +++ b/torch/csrc/utils/tensor_list.cpp @@ -30,7 +30,8 @@ static PyObject* recursive_to_list( } PyObject* tensor_to_list(const Tensor& tensor) { - Tensor data = tensor; + TORCH_CHECK(!tensor.unsafeGetTensorImpl()->is_python_dispatch(), ".tolist() is not supported for tensor subclasses."); + Tensor data = tensor.resolve_conj().resolve_neg(); if (!data.device().is_cpu()) { pybind11::gil_scoped_release no_gil; data = data.toBackend(Backend::CPU); diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 25e9a5962614fb..959e0d1d30e562 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -194,7 +194,7 @@ void recursive_store(char* data, IntArrayRef sizes, IntArrayRef strides, int64_t PyObject** items = PySequence_Fast_ITEMS(seq.get()); for(const auto i : c10::irange(n)) { #ifdef USE_NUMPY - if (PyArray_Check(items[i])) { + if (is_numpy_available() && PyArray_Check(items[i])) { TORCH_WARN_ONCE( "Creating a tensor from a list of numpy.ndarrays is extremely slow. " "Please consider converting the list to a single numpy.ndarray with " diff --git a/torch/csrc/utils/tensor_numpy.cpp b/torch/csrc/utils/tensor_numpy.cpp index 433d1e2e680841..e507ffbf448bb3 100644 --- a/torch/csrc/utils/tensor_numpy.cpp +++ b/torch/csrc/utils/tensor_numpy.cpp @@ -130,6 +130,8 @@ PyObject* tensor_to_numpy(const at::Tensor& tensor) { "Can't call numpy() on Tensor that has negative bit set. 
" "Use tensor.resolve_neg().numpy() instead."); + TORCH_CHECK(!tensor.unsafeGetTensorImpl()->is_python_dispatch(), ".numpy() is not supported for tensor subclasses."); + auto dtype = aten_to_numpy_dtype(tensor.scalar_type()); auto sizes = to_numpy_shape(tensor.sizes()); auto strides = to_numpy_shape(tensor.strides()); diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index e0c2d8907e2b69..a85cc9d224b9e3 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1519,8 +1519,11 @@ def _object_to_tensor(obj): f = io.BytesIO() _pickler(f).dump(obj) byte_storage = torch.ByteStorage.from_buffer(f.getvalue()) # type: ignore[attr-defined] - byte_tensor = torch.tensor(byte_storage, dtype=torch.uint8) - local_size = torch.tensor([byte_tensor.numel()], dtype=torch.long) + # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor and specifying dtype. + # Otherwise, it will casue 100X slowdown. + # See: https://github.com/pytorch/pytorch/issues/65696 + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) return byte_tensor, local_size diff --git a/torch/distributed/elastic/agent/server/local_elastic_agent.py b/torch/distributed/elastic/agent/server/local_elastic_agent.py index 4a8238f1259e85..c84df1a8e43426 100644 --- a/torch/distributed/elastic/agent/server/local_elastic_agent.py +++ b/torch/distributed/elastic/agent/server/local_elastic_agent.py @@ -21,10 +21,11 @@ WorkerState, ) from torch.distributed.elastic.metrics.api import prof -from torch.distributed.elastic.multiprocessing import start_processes, PContext +from torch.distributed.elastic.multiprocessing import PContext, start_processes from torch.distributed.elastic.utils import macros from torch.distributed.elastic.utils.logging import get_logger + log = get_logger() diff --git a/torch/distributed/elastic/multiprocessing/errors/__init__.py b/torch/distributed/elastic/multiprocessing/errors/__init__.py index c1be1e093484f2..be955d28d4704b 100644 --- a/torch/distributed/elastic/multiprocessing/errors/__init__.py +++ b/torch/distributed/elastic/multiprocessing/errors/__init__.py @@ -51,6 +51,7 @@ import json import os import signal +import socket import time import warnings from dataclasses import dataclass, field @@ -109,7 +110,7 @@ def __post_init__(self): try: with open(self.error_file, "r") as fp: self.error_file_data = json.load(fp) - log.info( + log.debug( f"User process failed with error data: {json.dumps(self.error_file_data, indent=2)}" ) self.message, self.timestamp = self._get_error_data( @@ -130,7 +131,7 @@ def __post_init__(self): f" received by PID {self.pid}" ) else: - self.message = f"Process failed with exitcode {self.exitcode}" + self.message = "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html" def _get_error_data(self, error_file_data: Dict[str, Any]) -> Tuple[str, int]: message = error_file_data["message"] @@ -162,24 +163,24 @@ def timestamp_isoformat(self): GlobalRank = int _FAILURE_FORMAT_TEMPLATE = """[${idx}]: - time: ${time} - rank: ${rank} (local_rank: ${local_rank}) - exitcode: ${exitcode} (pid: ${pid}) + time : ${time} + host : ${hostname} + rank : ${rank} (local_rank: ${local_rank}) + exitcode : ${exitcode} (pid: ${pid}) error_file: ${error_file} - msg: ${message}""" + traceback : ${message}""" # extra new lines before and after are intentional _MSG_FORMAT_TEMPLATE = """ ${boarder} ${title} ${section} -Root Cause: -${root_failure} -${section} 
-Other Failures: +Failures: ${other_failures} -${boarder} -""" +${section} +Root Cause (first observed failure): +${root_failure} +${boarder}""" class ChildFailedError(Exception): @@ -230,8 +231,8 @@ def get_first_failure(self) -> Tuple[GlobalRank, ProcessFailure]: rank = min(self.failures.keys(), key=lambda r: self.failures[r].timestamp) return rank, self.failures[rank] - def format_msg(self, boarder_delim="*", section_delim="="): - title = f" {self.name} FAILED " + def format_msg(self, boarder_delim="=", section_delim="-"): + title = f"{self.name} FAILED" root_rank, root_failure = self.get_first_failure() root_failure_fmt: str = "" @@ -246,11 +247,11 @@ def format_msg(self, boarder_delim="*", section_delim="="): other_failures_fmt.append(fmt) # upper boundary on width - width = min(width, 80) + width = min(width, 60) return Template(_MSG_FORMAT_TEMPLATE).substitute( boarder=boarder_delim * width, - title=title.center(width), + title=title, section=section_delim * width, root_failure=root_failure_fmt, other_failures="\n".join(other_failures_fmt or [" "]), @@ -279,6 +280,7 @@ def _format_failure( fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute( idx=idx, time=failure.timestamp_isoformat(), + hostname=socket.getfqdn(), rank=rank, local_rank=failure.local_rank, exitcode=failure.exitcode, @@ -292,32 +294,6 @@ def _format_failure( return fmt, width -def _no_error_file_warning_msg(rank: int, failure: ProcessFailure) -> str: - msg = [ - "CHILD PROCESS FAILED WITH NO ERROR_FILE", - f"Child process {failure.pid} (local_rank {rank}) FAILED (exitcode {failure.exitcode})", - f"Error msg: {failure.message}", - f"Without writing an error file to {failure.error_file}.", - "While this DOES NOT affect the correctness of your application,", - "no trace information about the error will be available for inspection.", - "Consider decorating your top level entrypoint function with", - "torch.distributed.elastic.multiprocessing.errors.record. Example:", - "", - r" from torch.distributed.elastic.multiprocessing.errors import record", - "", - r" @record", - r" def trainer_main(args):", - r" # do train", - ] - width = 0 - for line in msg: - width = max(width, len(line)) - - boarder = "*" * width - header = "CHILD PROCESS FAILED WITH NO ERROR_FILE".center(width) - return "\n".join(["\n", boarder, header, boarder, *msg, boarder]) - - def record( fn: Callable[..., T], error_handler: Optional[ErrorHandler] = None ) -> Callable[..., T]: @@ -372,7 +348,13 @@ def wrapper(*args, **kwargs): if failure.error_file != _NOT_AVAILABLE: error_handler.dump_error_file(failure.error_file, failure.exitcode) else: - warnings.warn(_no_error_file_warning_msg(rank, failure)) + log.info( + ( + f"local_rank {rank} FAILED with no error file." + f" Decorate your entrypoint fn with @record for traceback info." 
+ f" See: https://pytorch.org/docs/stable/elastic/errors.html" + ) + ) raise except Exception as e: error_handler.record_exception(e) diff --git a/torch/distributed/elastic/multiprocessing/errors/error_handler.py b/torch/distributed/elastic/multiprocessing/errors/error_handler.py index 74586e9fd85233..da47eb77e8a55a 100644 --- a/torch/distributed/elastic/multiprocessing/errors/error_handler.py +++ b/torch/distributed/elastic/multiprocessing/errors/error_handler.py @@ -107,7 +107,7 @@ def dump_error_file(self, rootcause_error_file: str, error_code: int = 0): else: rootcause_error["message"]["errorCode"] = error_code - log.info( + log.debug( f"child error file ({rootcause_error_file}) contents:\n" f"{json.dumps(rootcause_error, indent=2)}" ) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index c6e84d6f65f4bc..127d91355a4f47 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -304,6 +304,27 @@ def train(): if should_checkpoint: save_checkpoint(checkpoint_path) + +9. (Recommended) On worker errors, this tool will summarize the details of the error + (e.g. time, rank, host, pid, traceback, etc). On each node, the first error (by timestamp) + is heuristically reported as the "Root Cause" error. To get tracebacks as part of this + error summary print out, you must decorate your main entrypoint function in your + training script as shown in the example below. If not decorated, then the summary + will not include the traceback of the exception and will only contain the exitcode. + For details on torchelastic error handling see: https://pytorch.org/docs/stable/elastic/errors.html + +:: + + from torch.distributed.elastic.multiprocessing.errors import record + + @record + def main(): + # do train + pass + + if __name__ == "__main__": + main() + """ import logging import os @@ -597,7 +618,7 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1: omp_num_threads = 1 log.warning( - f"*****************************************\n" + f"\n*****************************************\n" f"Setting OMP_NUM_THREADS environment variable for each process to be " f"{omp_num_threads} in default, to avoid your system being overloaded, " f"please further tune the variable for optimal performance in " diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 4b0449c8f56727..5f14f31d15a46c 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -4256,7 +4256,10 @@ def pairwise_distance(x1: Tensor, x2: Tensor, p: float = 2.0, eps: float = 1e-6, r""" cosine_similarity(x1, x2, dim=1, eps=1e-8) -> Tensor -Returns cosine similarity between x1 and x2, computed along dim. +Returns cosine similarity between ``x1`` and ``x2``, computed along dim. ``x1`` and ``x2`` must be broadcastable +to a common shape. ``dim`` refers to the dimension in this common shape. Dimension ``dim`` of the output is +squeezed (see :func:`torch.squeeze`), resulting in the +output tensor having 1 fewer dimension. .. math :: \text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2 \cdot \Vert x_2 \Vert _2, \epsilon)} @@ -4265,16 +4268,11 @@ def pairwise_distance(x1: Tensor, x2: Tensor, p: float = 2.0, eps: float = 1e-6, Args: x1 (Tensor): First input. - x2 (Tensor): Second input (with the same number of dimensions as x1, matching x1 size at dimension `dim`, - and broadcastable with x1 at other dimensions). - dim (int, optional): Dimension of vectors. Default: 1 + x2 (Tensor): Second input. 
+ dim (int, optional): Dimension along which cosine similarity is computed. Default: 1 eps (float, optional): Small value to avoid division by zero. Default: 1e-8 -Shape: - - Input: :math:`(\ast_1, D, \ast_2)` where D is at position `dim`. - - Output: :math:`(\ast_1, \ast_2)` - Example:: >>> input1 = torch.randn(100, 128) diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index 2494c2d6dff854..3b641f85709db0 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -31,10 +31,10 @@ def _export(*args, **kwargs): def export(model, args, f, export_params=True, verbose=False, training=TrainingMode.EVAL, input_names=None, output_names=None, operator_export_type=None, - opset_version=None, _retain_param_name=True, do_constant_folding=True, - example_outputs=None, strip_doc_string=True, dynamic_axes=None, - keep_initializers_as_inputs=None, custom_opsets=None, enable_onnx_checker=True, - use_external_data_format=False): + opset_version=None, _retain_param_name=None, do_constant_folding=True, + example_outputs=None, strip_doc_string=None, dynamic_axes=None, + keep_initializers_as_inputs=None, custom_opsets=None, enable_onnx_checker=None, + use_external_data_format=None): r""" Exports a model into ONNX format. If ``model`` is not a :class:`torch.jit.ScriptModule` nor a :class:`torch.jit.ScriptFunction`, this runs @@ -105,7 +105,9 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM In this case, the exported model will first take all of its parameters as arguments, with the ordering as specified by ``model.state_dict().values()`` verbose (bool, default False): if True, prints a description of the - model being exported to stdout. + model being exported to stdout. In addition, the final ONNX graph will include the + field ``doc_string``` from the exported model which mentions the source code locations + for ``model``. training (enum, default TrainingMode.EVAL): * ``TrainingMode.EVAL``: export the model in inference mode. * ``TrainingMode.PRESERVE``: export the model in inference mode if model.training is @@ -181,16 +183,17 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM opset_version (int, default 9): Must be ``== _onnx_main_opset or in _onnx_stable_opsets``, defined in torch/onnx/symbolic_helper.py. + _retain_param_name (bool, default True): [Deprecated and ignored. Will be removed in next PyTorch + release] do_constant_folding (bool, default False): Apply the constant-folding optimization. Constant-folding will replace some of the ops that have all constant inputs with pre-computed constant nodes. example_outputs (T or a tuple of T, where T is Tensor or convertible to Tensor, default None): + [Deprecated and ignored. Will be removed in next PyTorch release], Must be provided when exporting a ScriptModule or ScriptFunction, ignored otherwise. Used to determine the type and shape of the outputs without tracing the execution of the model. A single object is treated as equivalent to a tuple of one element. - strip_doc_string (bool, default True): do not include the field - ``doc_string``` from the exported model. Otherwise the field will mention the source - code locations for ``model``. + strip_doc_string (bool, default True): [Deprecated and ignored. 
Will be removed in next PyTorch release] dynamic_axes (dict> or dict, default empty dict): By default the exported model will have the shapes of all input and output tensors @@ -293,10 +296,11 @@ def forward(self, x): enable_onnx_checker (bool, default True): Deprecated and ignored. Will be removed in next Pytorch release. - use_external_data_format (bool, default False): If True, then some of the model - parameters are stored in external data files and not in the ONNX model file itself. - Models larger than 2GB cannot be exported in one file because of size limits imposed - by Protocol Buffers. + use_external_data_format (bool, default False): [Deprecated and ignored. Will be removed in + next Pytorch release.] + If True, then some of the model parameters are stored in external data files and not in + the ONNX model file itself. Models larger than 2GB cannot be exported in one file because + of size limits imposed by Protocol Buffers. For details see `onnx.proto `_. If True, argument ``f`` must be a string specifying the location of the model. @@ -316,9 +320,24 @@ def forward(self, x): custom_opsets, enable_onnx_checker, use_external_data_format) -def export_to_pretty_string(*args, **kwargs): +def export_to_pretty_string(*args, **kwargs) -> str: r""" - Same as :func:`export`, but returns a text representation of the exported model. + Similar to :func:`export`, but returns a text representation of the ONNX + model. Only differences in args listed below. All other args are the same + as :func:`export`. + + Args: + f: Deprecated and ignored. Will be removed in the next release of + PyTorch. + add_node_names (bool, default True): Whether or not to set + NodeProto.name. This makes no difference unless + ``google_printer=True``. + google_printer (bool, default False): If False, will return a custom, + compact representation of the model. If True will return the + protobuf's `Message::DebugString()`, which is more verbose. + + Returns: + A UTF-8 str containing a human-readable representation of the ONNX model. """ from torch.onnx import utils return utils.export_to_pretty_string(*args, **kwargs) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 17b5953a2db533..b0418f7cd547b3 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -12,6 +12,7 @@ import re import collections import contextlib +import copy import numbers import warnings from torch._six import string_classes @@ -75,24 +76,37 @@ def select_model_mode_for_export(model, mode): def export(model, args, f, export_params=True, verbose=False, training=None, input_names=None, output_names=None, operator_export_type=None, - opset_version=None, _retain_param_name=True, do_constant_folding=True, - example_outputs=None, strip_doc_string=True, dynamic_axes=None, + opset_version=None, _retain_param_name=None, do_constant_folding=True, + example_outputs=None, strip_doc_string=None, dynamic_axes=None, keep_initializers_as_inputs=None, custom_opsets=None, - enable_onnx_checker=None, use_external_data_format=False): + enable_onnx_checker=None, use_external_data_format=None): if operator_export_type is None: if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE: operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK else: operator_export_type = OperatorExportTypes.ONNX + if enable_onnx_checker is not None: - warnings.warn("`enable_onnx_checker' is deprecated and ignored. It will be removed in" - "the next PyTorch release. 
To proceed despite ONNX checker failures, you"
-                      "can catch torch.onnx.ONNXCheckerError.")
+        warnings.warn("'enable_onnx_checker' is deprecated and ignored. It will be removed in "
+                      "the next PyTorch release. To proceed despite ONNX checker failures, "
+                      "catch torch.onnx.ONNXCheckerError.")
+    if _retain_param_name is not None:
+        warnings.warn("'_retain_param_name' is deprecated and ignored. "
+                      "It will be removed in the next PyTorch release.")
+    if strip_doc_string is not None:
+        warnings.warn("'strip_doc_string' is deprecated and ignored. It will be removed in "
+                      "the next PyTorch release. Its effect is now controlled by the 'verbose' argument.")
+    if example_outputs is not None:
+        warnings.warn("'example_outputs' is deprecated and ignored. "
+                      "It will be removed in the next PyTorch release.")
+    if use_external_data_format is not None:
+        warnings.warn("'use_external_data_format' is deprecated and ignored. It will be removed in "
+                      "the next PyTorch release. The exporter now chooses this automatically: models "
+                      "within the 2GB Protocol Buffers size limit are written as a single file, and "
+                      "larger models use external data files.")
     _export(model, args, f, export_params, verbose, training, input_names, output_names,
             operator_export_type=operator_export_type, opset_version=opset_version,
-            _retain_param_name=_retain_param_name, do_constant_folding=do_constant_folding,
-            example_outputs=example_outputs, strip_doc_string=strip_doc_string,
+            do_constant_folding=do_constant_folding, example_outputs=example_outputs,
             dynamic_axes=dynamic_axes, keep_initializers_as_inputs=keep_initializers_as_inputs,
             custom_opsets=custom_opsets, use_external_data_format=use_external_data_format)
@@ -310,7 +324,10 @@ def _decide_external_data_format(use_external_data_format, operator_export_type,
     # string specifying the location of the model. For large model cases, if f is not a non-empty string,
     # then this method returns an empty string, which is an error condition for the large model export code
     # path later (but not for regular model export code path).
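Taken together, the deprecations above mean a typical export call simply drops those arguments; passing them still works but only emits the warnings shown above. A minimal sketch, with a hypothetical `TwoLayer` module standing in for a real model:

```python
import torch

class TwoLayer(torch.nn.Module):  # hypothetical toy model
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.fc(x).relu()

# The deprecated keywords (_retain_param_name, strip_doc_string, example_outputs,
# enable_onnx_checker, use_external_data_format) are simply omitted;
# verbose=True now also keeps the doc_string fields in the exported graph.
torch.onnx.export(TwoLayer(), (torch.randn(1, 4),), "two_layer.onnx",
                  opset_version=13, verbose=True)
```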
- model_file_location = f if val_use_external_data_format and isinstance(f, str) else str() + if (val_use_external_data_format is None or val_use_external_data_format is True) and isinstance(f, str): + model_file_location = f + else: + model_file_location = str() return val_use_external_data_format, model_file_location def _decide_input_format(model, args): @@ -389,7 +406,7 @@ def _get_param_count_list(method_graph, args_params): return param_count_list -def _create_jit_graph(model, args, _retain_param_name): +def _create_jit_graph(model, args): torch_out = None params: Union[List, Tuple] if isinstance(model, torch.jit.ScriptModule): @@ -420,13 +437,12 @@ def _create_jit_graph(model, args, _retain_param_name): graph, torch_out = _trace_and_get_graph_from_model(model, args) state_dict = _unique_state_dict(model) params = list(state_dict.values()) - if _retain_param_name: - graph_inputs = list(graph.inputs()) - user_input_num = len(graph_inputs) - len(state_dict) - param_names = list(state_dict.keys()) - for i, inp in enumerate(graph_inputs): - if i >= user_input_num: - inp.setDebugName(param_names[i - user_input_num]) + graph_inputs = list(graph.inputs()) + user_input_num = len(graph_inputs) - len(state_dict) + param_names = list(state_dict.keys()) + for i, inp in enumerate(graph_inputs): + if i >= user_input_num: + inp.setDebugName(param_names[i - user_input_num]) torch._C._jit_pass_onnx_function_substitution(graph) return graph, params, torch_out, None @@ -437,12 +453,25 @@ def _get_named_param_dict(graph, params): _params_dict = dict(zip(param_names, params)) return _params_dict +def _get_example_outputs(model, args): + input_args = copy.deepcopy(args) + input_kwargs = {} + if input_args and isinstance(input_args[-1], dict): + input_kwargs = input_args[-1] + input_args = input_args[:-1] + + example_outputs = model(*input_args, **input_kwargs) + if isinstance(example_outputs, (torch.Tensor, int, float, bool)): + example_outputs = (example_outputs,) + + if isinstance(example_outputs, list): + example_outputs = [example_outputs] + return example_outputs def _model_to_graph(model, args, verbose=False, input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX, - example_outputs=None, - _retain_param_name=False, do_constant_folding=True, + example_outputs=None, do_constant_folding=True, _disable_torch_constant_prop=False, fixed_batch_size=False, training=None, dynamic_axes=None): r"""Converts model into an ONNX graph. @@ -461,11 +490,7 @@ def _model_to_graph(model, args, verbose=False, if isinstance(args, (torch.Tensor, int, float, bool)): args = (args, ) - if isinstance(example_outputs, (torch.Tensor, int, float, bool)): - example_outputs = (example_outputs,) - - graph, params, torch_out, module = _create_jit_graph(model, args, - _retain_param_name) + graph, params, torch_out, module = _create_jit_graph(model, args) params_dict = _get_named_param_dict(graph, params) @@ -476,13 +501,22 @@ def _model_to_graph(model, args, verbose=False, module=module) from torch.onnx.symbolic_helper import _onnx_shape_inference if isinstance(model, torch.jit.ScriptModule) or isinstance(model, torch.jit.ScriptFunction): - assert example_outputs is not None, "example_outputs must be provided when exporting a ScriptModule or " \ - "ScriptFunction." 
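With the `_get_example_outputs` fallback above, scripted inputs no longer need `example_outputs` to be passed explicitly. A sketch, assuming a trivial scripted function:

```python
import torch

@torch.jit.script
def scaled_relu(x: torch.Tensor) -> torch.Tensor:
    return torch.relu(x) * 2.0

# The exporter runs the scripted function once (via the new _get_example_outputs
# helper) to infer output types and shapes, so example_outputs can be omitted.
torch.onnx.export(scaled_relu, (torch.randn(3),), "scaled_relu.onnx", opset_version=13)
```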
- if isinstance(example_outputs, list): - example_outputs = [example_outputs] + if example_outputs is None: + example_outputs = _get_example_outputs(model, args) + else: + # example_outpus specified + if isinstance(example_outputs, (torch.Tensor, int, float, bool)): + example_outputs = (example_outputs,) + + if isinstance(example_outputs, list): + example_outputs = [example_outputs] out_vars, desc = torch.jit._flatten(tuple(example_outputs)) torch._C._jit_pass_onnx_assign_output_shape(graph, out_vars, desc, _onnx_shape_inference) + else: + flatten_args, _ = torch._C._jit_flatten(args) + # make sure that the param dict and the graph match each other + assert len(params) + len(flatten_args) == sum(1 for _ in graph.inputs()) # NB: ONNX requires complete information about output types, which might be # erased by some optimizations, so we need to set it explicitly again. @@ -533,14 +567,18 @@ def _model_to_graph(model, args, verbose=False, def export_to_pretty_string(model, args, f, export_params=True, verbose=False, training=None, input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX, export_type=ExportTypes.PROTOBUF_FILE, example_outputs=None, - google_printer=False, opset_version=None, _retain_param_name=True, + google_printer=False, opset_version=None, _retain_param_name=None, keep_initializers_as_inputs=None, custom_opsets=None, add_node_names=True, do_constant_folding=True, dynamic_axes=None): + if f is not None: + warnings.warn("'f' is deprecated and ignored. It will be removed in the next PyTorch release.") + if _retain_param_name is not None: + warnings.warn("'_retain_param_name' is deprecated and ignored. " + "It will be removed in the next PyTorch release.") return _export_to_pretty_string(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, example_outputs, google_printer, - opset_version, _retain_param_name, - do_constant_folding=do_constant_folding, + opset_version, do_constant_folding=do_constant_folding, add_node_names=add_node_names, keep_initializers_as_inputs=keep_initializers_as_inputs, custom_opsets=custom_opsets, dynamic_axes=dynamic_axes) @@ -549,7 +587,7 @@ def export_to_pretty_string(model, args, f, export_params=True, verbose=False, t def _export_to_pretty_string(model, args, f, export_params=True, verbose=False, training=None, input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX, export_type=ExportTypes.PROTOBUF_FILE, example_outputs=None, - google_printer=False, opset_version=None, _retain_param_name=False, + google_printer=False, opset_version=None, do_constant_folding=True, keep_initializers_as_inputs=None, fixed_batch_size=False, custom_opsets=None, add_node_names=True, onnx_shape_inference=True, dynamic_axes=None): @@ -572,8 +610,8 @@ def _export_to_pretty_string(model, args, f, export_params=True, verbose=False, args = _decide_input_format(model, args) graph, params_dict, torch_out = _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, - example_outputs, _retain_param_name, - val_do_constant_folding, fixed_batch_size=fixed_batch_size, + example_outputs, val_do_constant_folding, + fixed_batch_size=fixed_batch_size, training=training, dynamic_axes=dynamic_axes) return graph._pretty_print_onnx(params_dict, opset_version, False, @@ -633,10 +671,10 @@ def _find_missing_ops_onnx_export(model, args, f, verbose=False, training=Traini def _export(model, args, f, export_params=True, verbose=False, training=None, 
input_names=None, output_names=None, operator_export_type=None, export_type=ExportTypes.PROTOBUF_FILE, example_outputs=None, - opset_version=None, _retain_param_name=False, do_constant_folding=True, - strip_doc_string=True, dynamic_axes=None, keep_initializers_as_inputs=None, + opset_version=None, do_constant_folding=True, + dynamic_axes=None, keep_initializers_as_inputs=None, fixed_batch_size=False, custom_opsets=None, add_node_names=True, - use_external_data_format=False, onnx_shape_inference=True): + use_external_data_format=None, onnx_shape_inference=True): if isinstance(model, torch.nn.DataParallel): raise ValueError("torch.nn.DataParallel is not supported by ONNX " @@ -685,8 +723,7 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, graph, params_dict, torch_out = \ _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, - example_outputs, _retain_param_name, - val_do_constant_folding, + example_outputs, val_do_constant_folding, fixed_batch_size=fixed_batch_size, training=training, dynamic_axes=dynamic_axes) @@ -697,14 +734,14 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, custom_opsets = {} if export_params: - proto, export_map = graph._export_onnx( + proto, export_map, val_use_external_data_format = graph._export_onnx( params_dict, opset_version, dynamic_axes, defer_weight_export, - operator_export_type, strip_doc_string, val_keep_init_as_ip, custom_opsets, + operator_export_type, not verbose, val_keep_init_as_ip, custom_opsets, val_add_node_names, val_use_external_data_format, model_file_location) else: - proto, export_map = graph._export_onnx( + proto, export_map, val_use_external_data_format = graph._export_onnx( {}, opset_version, dynamic_axes, False, operator_export_type, - strip_doc_string, val_keep_init_as_ip, custom_opsets, val_add_node_names, + not verbose, val_keep_init_as_ip, custom_opsets, val_add_node_names, val_use_external_data_format, model_file_location) if export_type == ExportTypes.PROTOBUF_FILE: diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 5d0849bb8407d7..78af6dcc968806 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -6,7 +6,7 @@ from torch.testing._internal.common_utils import TEST_NUMBA import inspect import contextlib -from setuptools import distutils +from distutils.version import LooseVersion TEST_CUDA = torch.cuda.is_available() @@ -16,7 +16,7 @@ TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.tensor(1., device=CUDA_DEVICE)) TEST_CUDNN_VERSION = torch.backends.cudnn.version() if TEST_CUDNN else 0 -CUDA11OrLater = torch.version.cuda and distutils.version.LooseVersion(torch.version.cuda) >= "11.0" +CUDA11OrLater = torch.version.cuda and LooseVersion(torch.version.cuda) >= "11.0" CUDA9 = torch.version.cuda and torch.version.cuda.startswith('9.') SM53OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3) SM60OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index d817c9e56ac5bf..42582268b55d9e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -37,7 +37,7 @@ GRADCHECK_NONDET_TOL, skipIfTBB) import torch.testing._internal.opinfo_helper as opinfo_helper -from setuptools import 
distutils +from distutils.version import LooseVersion if TEST_SCIPY: import scipy.special @@ -1256,6 +1256,8 @@ def generator(): yield SampleInput(make_arg(input_shape), args=(make_arg(input_shape),), kwargs=kwargs) # Test for Broadcasting yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) + yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -2}) + yield SampleInput(make_arg((2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) return list(generator()) @@ -9353,11 +9355,11 @@ def generate_std_var_kwargs(t: torch.Tensor, **kwargs): skips=( # Reference: https://github.com/pytorch/pytorch/pull/49155#issuecomment-742664611 DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - active_if=TEST_SCIPY and distutils.version.LooseVersion(scipy.__version__) < "1.4.0"), + active_if=TEST_SCIPY and LooseVersion(scipy.__version__) < "1.4.0"), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_hard', - active_if=TEST_SCIPY and distutils.version.LooseVersion(scipy.__version__) < "1.4.0"), + active_if=TEST_SCIPY and LooseVersion(scipy.__version__) < "1.4.0"), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_normal', - active_if=TEST_SCIPY and distutils.version.LooseVersion(scipy.__version__) < "1.4.0"), + active_if=TEST_SCIPY and LooseVersion(scipy.__version__) < "1.4.0"), )), UnaryUfuncInfo('lgamma', ref=reference_lgamma if TEST_SCIPY else _NOTHING, diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index b32c3a159fc0d0..b4ef79620d8d92 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1781,8 +1781,10 @@ def assertEqual(self, x, y, msg: Optional[str] = None, *, assert (atol is None) == (rtol is None), "If one of atol or rtol is specified, then the other must be too" debug_msg: Optional[str] = None + if x is None or y is None: + self.assertTrue(x is None and y is None) # Tensor x Number and Number x Tensor comparisons - if isinstance(x, torch.Tensor) and isinstance(y, Number): + elif isinstance(x, torch.Tensor) and isinstance(y, Number): self.assertEqual(x.item(), y, atol=atol, rtol=rtol, msg=msg, exact_dtype=exact_dtype, exact_device=exact_device) elif isinstance(y, torch.Tensor) and isinstance(x, Number): diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index bb0a85982c665c..a482f5d8e29cbd 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -16,7 +16,7 @@ from .file_baton import FileBaton from ._cpp_extension_versioner import ExtensionVersioner from .hipify import hipify_python -from .hipify.hipify_python import get_hip_file_path, GeneratedFileCleaner +from .hipify.hipify_python import GeneratedFileCleaner from typing import List, Optional, Union from setuptools.command.build_ext import build_ext @@ -939,16 +939,19 @@ def CUDAExtension(name, sources, *args, **kwargs): hipify_result = hipify_python.hipify( project_directory=build_dir, output_directory=build_dir, - includes=[os.path.join(os.path.relpath(include_dir, build_dir), '*') for include_dir in include_dirs] if include_dirs else ['*'], + header_include_dirs=include_dirs, + includes=[os.path.join(build_dir, '*')], # limit scope to build_dir only extra_files=[os.path.abspath(s) for s in sources], show_detailed=True, is_pytorch_extension=True, + hipify_extra_files_only=True, # don't hipify everything in includes path ) 
hipified_sources = set() for source in sources: s_abs = os.path.abspath(source) - hipified_sources.add(hipify_result[s_abs]["hipified_path"] if s_abs in hipify_result else s_abs) + hipified_sources.add(hipify_result[s_abs]["hipified_path"] if (s_abs in hipify_result and + hipify_result[s_abs]["hipified_path"] is not None) else s_abs) sources = list(hipified_sources) @@ -1325,15 +1328,25 @@ def _jit_compile(name, try: with GeneratedFileCleaner(keep_intermediates=keep_intermediates) as clean_ctx: if IS_HIP_EXTENSION and (with_cuda or with_cudnn): - hipify_python.hipify( + hipify_result = hipify_python.hipify( project_directory=build_directory, output_directory=build_directory, - includes=os.path.join(build_directory, '*'), + header_include_dirs=(extra_include_paths if extra_include_paths is not None else []), extra_files=[os.path.abspath(s) for s in sources], + ignores=[os.path.join(ROCM_HOME, '*'), os.path.join(_TORCH_PATH, '*')], # no need to hipify ROCm or PyTorch headers show_detailed=verbose, + show_progress=verbose, is_pytorch_extension=True, clean_ctx=clean_ctx ) + + hipified_sources = set() + for source in sources: + s_abs = os.path.abspath(source) + hipified_sources.add(hipify_result[s_abs]["hipified_path"] if s_abs in hipify_result else s_abs) + + sources = list(hipified_sources) + _write_ninja_file_and_build_library( name=name, sources=sources, @@ -1636,7 +1649,7 @@ def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: # Use same defaults from file cmake/public/LoadHIP.cmake. # Must keep in sync if defaults change. # Allow env var to override, just like during initial cmake build. - archs = os.environ.get('PYTORCH_ROCM_ARCH', 'gfx803;gfx900;gfx906;gfx908') + archs = os.environ.get('PYTORCH_ROCM_ARCH', 'gfx900;gfx906;gfx908;gfx90a') flags = ['--amdgpu-target=%s' % arch for arch in archs.split(';')] flags += ['-fno-gpu-rdc'] return flags @@ -1826,10 +1839,6 @@ def _write_ninja_file_to_build_library(path, cuda_flags = ['-DWITH_HIP'] + cflags + COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS cuda_flags += extra_cuda_cflags cuda_flags += _get_rocm_arch_flags(cuda_flags) - sources = [s if not _is_cuda_file(s) else - os.path.abspath(os.path.join( - path, get_hip_file_path(os.path.relpath(s, path), is_pytorch_extension=True))) - for s in sources] elif with_cuda: cuda_flags = common_cflags + COMMON_NVCC_FLAGS + _get_cuda_arch_flags() if IS_WINDOWS: @@ -1940,6 +1949,8 @@ def sanitize_flags(flags): nvcc = _join_cuda_home('bin', 'nvcc') config.append(f'nvcc = {nvcc}') + if IS_HIP_EXTENSION: + post_cflags = COMMON_HIP_FLAGS + post_cflags flags = [f'cflags = {" ".join(cflags)}'] flags.append(f'post_cflags = {" ".join(post_cflags)}') if with_cuda: diff --git a/torch/utils/data/datapipes/README.md b/torch/utils/data/datapipes/README.md index 69cd56d3cfbd1b..e19e1efdc6e2cb 100644 --- a/torch/utils/data/datapipes/README.md +++ b/torch/utils/data/datapipes/README.md @@ -21,7 +21,9 @@ class MapperIterDataPipe(IterDataPipe): self.dp = dp self.fn = fn ``` -Note: Avoid loading data from the source DataPipe in `__init__` function, in order to support lazy data loading and save memory. +Note: +- Avoid loading data from the source DataPipe in `__init__` function, in order to support lazy data loading and save memory. +- If `IterDataPipe` instance holds data in memory, please be ware of the in-place modification of data. When second iterator is created from the instance, the data may have already changed. 
Please take [`IterableWrapper`](https://github.com/pytorch/pytorch/blob/master/torch/utils/data/datapipes/iter/utils.py) class as reference to `deepcopy` data for each iterator. ### Iterator For `IterDataPipe`, an `__iter__` function is needed to consume data from the source `IterDataPipe` then apply operation over the data before yield. diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py index e7f81842b233ef..bd1134c23e1f97 100644 --- a/torch/utils/data/datapipes/iter/callable.py +++ b/torch/utils/data/datapipes/iter/callable.py @@ -1,4 +1,3 @@ -import copy import warnings from torch.utils.data import IterDataPipe, _utils, functional_datapipe, DataChunk from typing import Callable, Dict, Iterator, Optional, Sized, Tuple, TypeVar @@ -99,8 +98,6 @@ def _apply_fn(self, data): data = list(data) else: t_flag = False - # Deepcopy data to prevent the original data modified. E.g. list, dict - data = copy.deepcopy(data) if self.output_col is None: if isinstance(self.input_col, (list, tuple)): diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index f8486811adeeb2..37dcc6238be2cb 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -1,9 +1,9 @@ import random -import warnings from collections import defaultdict from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk +from torch.utils.data.datapipes.utils.common import deprecation_warning_torchdata from typing import Any, Callable, DefaultDict, Iterator, List, Optional, Sized, TypeVar T_co = TypeVar('T_co', covariant=True) @@ -185,8 +185,7 @@ def __init__(self, assert batch_size > 0, "Batch size is required to be larger than 0!" assert batch_num > 0, "Number of batches is required to be larger than 0!" assert bucket_num > 0, "Number of buckets is required to be larger than 0!" 
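A minimal sketch of the lazy pattern the README note above recommends, with a toy `MapperIterDataPipe` that defers all reads to `__iter__`:

```python
from torch.utils.data import IterDataPipe

class MapperIterDataPipe(IterDataPipe):
    def __init__(self, dp, fn):
        # No data is loaded here; only references are stored.
        self.dp = dp
        self.fn = fn

    def __iter__(self):
        # Items are pulled from the source pipe lazily, one at a time.
        for item in self.dp:
            yield self.fn(item)
```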
- - warnings.warn("`BucketBatcher` is going to be removed from PyTorch Core") + deprecation_warning_torchdata(type(self).__name__) super().__init__() # TODO: Verify _datapippe is not going to be serialized twice diff --git a/torch/utils/data/datapipes/iter/httpreader.py b/torch/utils/data/datapipes/iter/httpreader.py index 0c8e2fc818e9fa..f13aa5bfc7e8ec 100644 --- a/torch/utils/data/datapipes/iter/httpreader.py +++ b/torch/utils/data/datapipes/iter/httpreader.py @@ -3,6 +3,7 @@ from urllib.error import HTTPError, URLError import urllib.request as urllib from torch.utils.data import IterDataPipe +from torch.utils.data.datapipes.utils.common import deprecation_warning_torchdata class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): @@ -19,6 +20,7 @@ class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): def __init__(self, datapipe, timeout=None): self.datapipe = datapipe self.timeout = timeout + deprecation_warning_torchdata(type(self).__name__) def __iter__(self): for furl in self.datapipe: diff --git a/torch/utils/data/datapipes/iter/linereader.py b/torch/utils/data/datapipes/iter/linereader.py index 04b992d647b777..311053eb973ed2 100644 --- a/torch/utils/data/datapipes/iter/linereader.py +++ b/torch/utils/data/datapipes/iter/linereader.py @@ -1,5 +1,6 @@ from typing import Tuple from torch.utils.data import IterDataPipe +from torch.utils.data.datapipes.utils.common import deprecation_warning_torchdata class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]): @@ -14,6 +15,7 @@ class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]): def __init__(self, datapipe): self.datapipe = datapipe + deprecation_warning_torchdata(type(self).__name__) def __iter__(self): for file_name, stream in self.datapipe: diff --git a/torch/utils/data/datapipes/iter/tararchivereader.py b/torch/utils/data/datapipes/iter/tararchivereader.py index 95c3331ee647bd..60bb16f149e909 100644 --- a/torch/utils/data/datapipes/iter/tararchivereader.py +++ b/torch/utils/data/datapipes/iter/tararchivereader.py @@ -1,5 +1,5 @@ from torch.utils.data import IterDataPipe -from torch.utils.data.datapipes.utils.common import validate_pathname_binary_tuple +from torch.utils.data.datapipes.utils.common import validate_pathname_binary_tuple, deprecation_warning_torchdata from typing import Iterable, Iterator, Tuple, Optional, IO, cast from io import BufferedIOBase @@ -34,11 +34,13 @@ def __init__( self.datapipe: Iterable[Tuple[str, BufferedIOBase]] = datapipe self.mode: str = mode self.length: int = length + deprecation_warning_torchdata(type(self).__name__) def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: for data in self.datapipe: validate_pathname_binary_tuple(data) pathname, data_stream = data + folder_name = os.path.dirname(pathname) try: # typing.cast is used here to silence mypy's type checker tar = tarfile.open(fileobj=cast(Optional[IO[bytes]], data_stream), mode=self.mode) @@ -49,14 +51,12 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: if extracted_fobj is None: warnings.warn("failed to extract file {} from source tarfile {}".format(tarinfo.name, pathname)) raise tarfile.ExtractError - inner_pathname = os.path.normpath(os.path.join(pathname, tarinfo.name)) + inner_pathname = os.path.normpath(os.path.join(folder_name, tarinfo.name)) yield (inner_pathname, extracted_fobj) # type: ignore[misc] except Exception as e: warnings.warn( "Unable to extract files from corrupted tarfile stream {} due to: {}, abort!".format(pathname, e)) raise e - finally: - data_stream.close() def 
__len__(self): if self.length == -1: diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py index 9ba80e3576f77d..246244a6efdbaa 100644 --- a/torch/utils/data/datapipes/iter/utils.py +++ b/torch/utils/data/datapipes/iter/utils.py @@ -1,3 +1,5 @@ +import copy +import warnings from torch.utils.data import IterDataPipe @@ -8,12 +10,34 @@ class IterableWrapperIterDataPipe(IterDataPipe): Args: iterable: Iterable object to be wrapped into an IterDataPipe + deepcopy: Option to deepcopy input iterable object for each + iteration. + + .. note:: + If `deepcopy` is set to False explicitly, users should ensure + that data pipeline doesn't contain any in-place operations over + the iterable instance, in order to prevent data inconsistency + across iterations. """ - def __init__(self, iterable): + def __init__(self, iterable, deepcopy=True): self.iterable = iterable + self.deepcopy = deepcopy def __iter__(self): - for data in self.iterable: + source_data = self.iterable + if self.deepcopy: + try: + source_data = copy.deepcopy(self.iterable) + # For the case that data cannot be deep-copied, + # all in-place operations will affect iterable variable. + # When this DataPipe is iterated second time, it will + # yield modified items. + except TypeError: + warnings.warn( + "The input iterable can not be deepcopied, " + "please be aware of in-place modification would affect source data" + ) + for data in source_data: yield data def __len__(self): diff --git a/torch/utils/data/datapipes/iter/ziparchivereader.py b/torch/utils/data/datapipes/iter/ziparchivereader.py index 66af28e1fbb4a8..988384b79b2f7b 100644 --- a/torch/utils/data/datapipes/iter/ziparchivereader.py +++ b/torch/utils/data/datapipes/iter/ziparchivereader.py @@ -1,5 +1,5 @@ from torch.utils.data import IterDataPipe -from torch.utils.data.datapipes.utils.common import validate_pathname_binary_tuple +from torch.utils.data.datapipes.utils.common import validate_pathname_binary_tuple, deprecation_warning_torchdata from typing import Iterable, Iterator, Tuple, IO, cast from io import BufferedIOBase @@ -31,11 +31,13 @@ def __init__( super().__init__() self.datapipe: Iterable[Tuple[str, BufferedIOBase]] = datapipe self.length: int = length + deprecation_warning_torchdata(type(self).__name__) def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: for data in self.datapipe: validate_pathname_binary_tuple(data) pathname, data_stream = data + folder_name = os.path.dirname(pathname) try: # typing.cast is used here to silence mypy's type checker zips = zipfile.ZipFile(cast(IO[bytes], data_stream)) @@ -47,7 +49,7 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: elif zipinfo.filename.endswith('/'): continue extracted_fobj = zips.open(zipinfo) - inner_pathname = os.path.normpath(os.path.join(pathname, zipinfo.filename)) + inner_pathname = os.path.normpath(os.path.join(folder_name, zipinfo.filename)) yield (inner_pathname, extracted_fobj) # type: ignore[misc] except Exception as e: warnings.warn( diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py index bdb5a26660d8c9..a0ed36d388154f 100644 --- a/torch/utils/data/datapipes/utils/common.py +++ b/torch/utils/data/datapipes/utils/common.py @@ -63,3 +63,9 @@ def validate_pathname_binary_tuple(data): raise TypeError("pathname binary tuple should have string type pathname, but got {}".format(type(data[0]))) if not isinstance(data[1], BufferedIOBase): raise TypeError("pathname binary tuple should have 
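Sketch of the new `deepcopy` option on `IterableWrapper` (assuming the alias exported from `torch.utils.data.datapipes.iter`): with the default, per-iteration mutations no longer leak back into the source data.

```python
from torch.utils.data.datapipes.iter import IterableWrapper

data = [[0], [1], [2]]
dp = IterableWrapper(data)                          # deepcopy=True: fresh copy per iteration
dp_shared = IterableWrapper(data, deepcopy=False)   # shares `data`; in-place edits persist

for row in dp:
    row.append(99)   # mutates only this iteration's copy
print(data)          # [[0], [1], [2]] -- source unchanged
```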
BufferedIOBase based binary type, but got {}".format(type(data[1]))) + +# Warns user that the DataPipe has been moved to TorchData and will be removed from `torch` +def deprecation_warning_torchdata(name): + warnings.warn(f"{name} and its functional API are deprecated and will be removed from the package `torch`. " + f"Please import those features from the new package TorchData: https://github.com/pytorch/data", + DeprecationWarning) diff --git a/torch/utils/data/sampler.py b/torch/utils/data/sampler.py index 79033470995dc5..232302e53c5418 100644 --- a/torch/utils/data/sampler.py +++ b/torch/utils/data/sampler.py @@ -112,15 +112,18 @@ def num_samples(self) -> int: def __iter__(self) -> Iterator[int]: n = len(self.data_source) if self.generator is None: - self.generator = torch.Generator() - self.generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item())) + seed = int(torch.empty((), dtype=torch.int64).random_().item()) + generator = torch.Generator() + generator.manual_seed(seed) + else: + generator = self.generator if self.replacement: for _ in range(self.num_samples // 32): - yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=self.generator).tolist() - yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=self.generator).tolist() + yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist() + yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist() else: - yield from torch.randperm(n, generator=self.generator).tolist() + yield from torch.randperm(n, generator=generator).tolist() def __len__(self) -> int: return self.num_samples @@ -140,7 +143,8 @@ def __init__(self, indices: Sequence[int], generator=None) -> None: self.generator = generator def __iter__(self) -> Iterator[int]: - return (self.indices[i] for i in torch.randperm(len(self.indices), generator=self.generator)) + for i in torch.randperm(len(self.indices), generator=self.generator): + yield self.indices[i] def __len__(self) -> int: return len(self.indices) @@ -183,7 +187,7 @@ def __init__(self, weights: Sequence[float], num_samples: int, def __iter__(self) -> Iterator[int]: rand_tensor = torch.multinomial(self.weights, self.num_samples, self.replacement, generator=self.generator) - return iter(rand_tensor.tolist()) + yield from iter(rand_tensor.tolist()) def __len__(self) -> int: return self.num_samples diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py old mode 100644 new mode 100755 index ad2903f7ad655f..b9d084a8decdac --- a/torch/utils/hipify/hipify_python.py +++ b/torch/utils/hipify/hipify_python.py @@ -117,15 +117,16 @@ def match_extensions(filename: str, extensions: Iterable) -> bool: """Helper method to see if filename ends with certain extension""" return any(filename.endswith(e) for e in extensions) +def _fnmatch(filepath, patterns): + return any(fnmatch.fnmatch(filepath, pattern) for pattern in patterns) + def matched_files_iter( root_path: str, - includes: Iterable = ('*',), + includes: Iterable = (), ignores: Iterable = (), extensions: Iterable = (), out_of_place_only: bool = False, is_pytorch_extension: bool = False) -> Iterator[str]: - def _fnmatch(filepath, patterns): - return any(fnmatch.fnmatch(filepath, pattern) for pattern in patterns) exact_matches = set(includes) @@ -145,7 +146,8 @@ def _fnmatch(filepath, patterns): if "third_party" in dirs: dirs.remove("third_party") for filename in filenames: - 
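Since the reworked `RandomSampler.__iter__` above no longer caches a generator it seeded itself, reproducible shuffling is obtained by passing an explicit generator. A small sketch:

```python
import torch
from torch.utils.data import RandomSampler

g = torch.Generator()
g.manual_seed(0)
sampler = RandomSampler(range(8), generator=g)
print(list(sampler))  # same permutation whenever the generator is re-seeded with 0
```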
filepath = os.path.join(rel_dirpath, filename) + filepath = os.path.join(abs_dirpath, filename) + rel_filepath = os.path.join(rel_dirpath, filename) # We respect extensions, UNLESS you wrote the entire # filename verbatim, in which case we always accept it if ( @@ -154,9 +156,9 @@ def _fnmatch(filepath, patterns): and (match_extensions(filepath, extensions) or filepath in exact_matches) ): if not is_pytorch_extension: # for pytorch extensions, consider all files - if not is_pytorch_file(filepath) and not is_caffe2_gpu_file(filepath): + if not is_pytorch_file(rel_filepath) and not is_caffe2_gpu_file(rel_filepath): continue - if out_of_place_only and not is_out_of_place(filepath): + if out_of_place_only and not is_out_of_place(rel_filepath): continue yield filepath @@ -165,59 +167,23 @@ def preprocess_file_and_save_result( output_directory: str, filepath: str, all_files: Iterable, - includes: Iterable, + header_include_dirs: Iterable, stats: Dict[str, List], hip_clang_launch: bool, is_pytorch_extension: bool, clean_ctx: GeneratedFileCleaner, show_progress: bool) -> None: - result = preprocessor(output_directory, filepath, all_files, includes, stats, + result = preprocessor(output_directory, filepath, all_files, header_include_dirs, stats, hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress) fin_path = os.path.abspath(os.path.join(output_directory, filepath)) # Show what happened - if show_progress: + if show_progress and "ignored" not in result["status"]: print( fin_path, "->", - result["hipified_path"], result["status"]) - - if result["hipified_path"] is not None: - HIPIFY_FINAL_RESULT[fin_path] = result - - -def preprocess( - output_directory: str, - all_files: Iterable, - includes: Iterable, - show_detailed: bool = False, - show_progress: bool = True, - hip_clang_launch: bool = False, - is_pytorch_extension: bool = False, - clean_ctx: GeneratedFileCleaner = None) -> HipifyFinalResult: - """ - Call preprocessor on selected files. - - Arguments) - show_detailed - Show a detailed summary of the transpilation process. - """ - - if clean_ctx is None: - clean_ctx = GeneratedFileCleaner(keep_intermediates=True) - - # Preprocessing statistics. - stats: Dict[str, List] = {"unsupported_calls": [], "kernel_launches": []} - - for filepath in all_files: - preprocess_file_and_save_result(output_directory, filepath, all_files, includes, stats, - hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress) + result["hipified_path"], result["status"], flush=True) - print(bcolors.OKGREEN + "Successfully preprocessed all matching files." + bcolors.ENDC, file=sys.stderr) - - # Show detailed summary - if show_detailed: - compute_stats(stats) - - return HIPIFY_FINAL_RESULT + HIPIFY_FINAL_RESULT[fin_path] = result def compute_stats(stats): @@ -509,16 +475,17 @@ def replace_extern_shared(input_string): return output_string -def get_hip_file_path(filepath, is_pytorch_extension=False): +def get_hip_file_path(rel_filepath, is_pytorch_extension=False): """ Returns the new name of the hipified file """ # At the moment, some PyTorch source files are HIPified in place. The predicate # is_out_of_place tells us if this is the case or not. 
- if not is_pytorch_extension and not is_out_of_place(filepath): - return filepath + assert(not os.path.isabs(rel_filepath)) + if not is_pytorch_extension and not is_out_of_place(rel_filepath): + return rel_filepath - dirpath, filename = os.path.split(filepath) + dirpath, filename = os.path.split(rel_filepath) root, ext = os.path.splitext(filename) # Here's the plan: @@ -562,6 +529,7 @@ def get_hip_file_path(filepath, is_pytorch_extension=False): orig_dirpath = dirpath dirpath = dirpath.replace('cuda', 'hip') + dirpath = dirpath.replace('CUDA', 'HIP') dirpath = dirpath.replace('THC', 'THH') root = root.replace('cuda', 'hip') @@ -579,31 +547,34 @@ def get_hip_file_path(filepath, is_pytorch_extension=False): return os.path.join(dirpath, root + ext) -def is_out_of_place(filepath): - if filepath.startswith("torch/"): +def is_out_of_place(rel_filepath): + assert(not os.path.isabs(rel_filepath)) + if rel_filepath.startswith("torch/"): return False - if filepath.startswith("tools/autograd/templates/"): + if rel_filepath.startswith("tools/autograd/templates/"): return False return True # Keep this synchronized with includes/ignores in build_amd.py -def is_pytorch_file(filepath): - if filepath.startswith("aten/"): - if filepath.startswith("aten/src/ATen/core/"): +def is_pytorch_file(rel_filepath): + assert(not os.path.isabs(rel_filepath)) + if rel_filepath.startswith("aten/"): + if rel_filepath.startswith("aten/src/ATen/core/"): return False return True - if filepath.startswith("torch/"): + if rel_filepath.startswith("torch/"): return True - if filepath.startswith("tools/autograd/templates/"): + if rel_filepath.startswith("tools/autograd/templates/"): return True return False -def is_caffe2_gpu_file(filepath): - if filepath.startswith("c10/cuda"): +def is_caffe2_gpu_file(rel_filepath): + assert(not os.path.isabs(rel_filepath)) + if rel_filepath.startswith("c10/cuda"): return True - filename = os.path.basename(filepath) + filename = os.path.basename(rel_filepath) _, ext = os.path.splitext(filename) return ('gpu' in filename or ext in ['.cu', '.cuh']) and ('cudnn' not in filename) @@ -697,31 +668,36 @@ def pattern(self): Returns a dict with the following keys: "hipified_path" : absolute path of hipified source file "status" : "ok" if hipified file was written out - "skipped" if an identical hipified file already existed - "ignored" if the source file was a hipified file itself + "skipped" if an identical hipified file already existed or hipified file couldn't be written out + "ignored" if the source file was a hipified file itself or not meant to be hipified """ def preprocessor( output_directory: str, filepath: str, all_files: Iterable, - includes: Iterable, + header_include_dirs: Iterable, stats: Dict[str, List], hip_clang_launch: bool, is_pytorch_extension: bool, clean_ctx: GeneratedFileCleaner, show_progress: bool) -> HipifyResult: """ Executes the CUDA -> HIP conversion on the specified file. 
""" + if filepath not in all_files: + return {"hipified_path": None, "status": "[ignored, not to be hipified]"} + fin_path = os.path.abspath(os.path.join(output_directory, filepath)) + rel_filepath = os.path.relpath(filepath, output_directory) with open(fin_path, 'r', encoding='utf-8') as fin: if fin.readline() == HIPIFY_C_BREADCRUMB: - return {"hipified_path": None, "status": "ignored"} + return {"hipified_path": None, "status": "[ignored, input is hipified output]"} fin.seek(0) output_source = fin.read() orig_output_source = output_source - fout_path = os.path.abspath(os.path.join(output_directory, get_hip_file_path(filepath, is_pytorch_extension))) + # get_hip_file_path needs a relative path to work correctly + fout_path = os.path.abspath(os.path.join(output_directory, get_hip_file_path(rel_filepath, is_pytorch_extension))) if not os.path.exists(os.path.dirname(fout_path)): clean_ctx.makedirs(os.path.dirname(fout_path)) @@ -732,7 +708,7 @@ def pt_repl(m): if is_pytorch_extension: output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source) else: - if is_pytorch_file(filepath): + if is_pytorch_file(rel_filepath): output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source) else: def c2_repl(m): @@ -766,8 +742,8 @@ def repl(m): header_filepath = header_path_to_check # If not found, look in include dirs one by one and first match wins if header_filepath is None: - for include in includes: - header_dir_to_check = os.path.join(output_directory, os.path.dirname(include)) + for header_include_dir in header_include_dirs: + header_dir_to_check = os.path.join(output_directory, header_include_dir) header_path_to_check = os.path.abspath(os.path.join(header_dir_to_check, f)) if os.path.exists(header_path_to_check): header_dir = header_dir_to_check @@ -778,12 +754,12 @@ def repl(m): # Hipify header file first if needed if header_filepath not in HIPIFY_FINAL_RESULT: preprocess_file_and_save_result(output_directory, - os.path.relpath(header_filepath, output_directory), - all_files, includes, stats, hip_clang_launch, is_pytorch_extension, - clean_ctx, show_progress) - value = HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"] - assert value is not None - return templ.format(os.path.relpath(value, header_dir)) + header_filepath, + all_files, header_include_dirs, stats, hip_clang_launch, + is_pytorch_extension, clean_ctx, show_progress) + hipified_header_filepath = HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"] + return templ.format(os.path.relpath(hipified_header_filepath if hipified_header_filepath is not None + else header_filepath, header_dir)) return m.group(0) return repl @@ -817,7 +793,7 @@ def repl(m): and orig_output_source == output_source and os.path.dirname(fin_path) == os.path.dirname(fout_path) ): - return {"hipified_path": fin_path, "status": "ok"} + return {"hipified_path": fin_path, "status": "[skipped, no changes]"} # Add hipify breadcrumb for C-style files to avoid re-hipification if fin_path != fout_path and match_extensions(fin_path, (".cu", ".cuh", ".c", ".cc", ".cpp", ".h", ".hpp")): @@ -831,13 +807,13 @@ def repl(m): try: with clean_ctx.open(fout_path, 'w', encoding='utf-8') as fout: fout.write(output_source) - return {"hipified_path": fout_path, "status": "ok"} + return {"hipified_path": fout_path, "status": "[ok]"} except PermissionError as e: print(f"{bcolors.WARNING}Failed to save {fout_path} with \"{e.strerror}\", leaving {fin_path} unchanged.{bcolors.ENDC}", file=sys.stderr) - return {"hipified_path": fin_path, "status": "skipped"} + return 
{"hipified_path": fin_path, "status": "[skipped, no permissions]"} else: - return {"hipified_path": fout_path, "status": "skipped"} + return {"hipified_path": fout_path, "status": "[skipped, already hipified]"} def file_specific_replacement(filepath, search_string, replace_string, strict=False): with openf(filepath, "r+") as f: @@ -932,14 +908,17 @@ def hipify( project_directory: str, show_detailed: bool = False, extensions: Iterable = (".cu", ".cuh", ".c", ".cc", ".cpp", ".h", ".in", ".hpp"), + header_extensions: Iterable = (".cuh", ".h", ".hpp"), output_directory: str = "", - includes: Iterable = (), + header_include_dirs: Iterable = (), + includes: Iterable = ('*',), extra_files: Iterable = (), out_of_place_only: bool = False, ignores: Iterable = (), show_progress: bool = True, hip_clang_launch: bool = False, is_pytorch_extension: bool = False, + hipify_extra_files_only: bool = False, clean_ctx: GeneratedFileCleaner = None ) -> HipifyFinalResult: if project_directory == "": @@ -955,6 +934,10 @@ def hipify( project_directory.rstrip("/") output_directory = project_directory + "_amd" + if project_directory != output_directory: + includes = [include.replace(project_directory, output_directory) for include in includes] + ignores = [ignore.replace(project_directory, output_directory) for ignore in ignores] + # Copy from project directory to output directory if not done already. if not os.path.exists(output_directory): shutil.copytree(project_directory, output_directory) @@ -964,19 +947,42 @@ def hipify( out_of_place_only=out_of_place_only, is_pytorch_extension=is_pytorch_extension)) all_files_set = set(all_files) - # Convert extra_files to relative paths since all_files has all relative paths for f in extra_files: - f_rel = os.path.relpath(f, output_directory) - if f_rel not in all_files_set: - all_files.append(f_rel) - - # Start Preprocessor - return preprocess( - output_directory, - all_files, - includes, - show_detailed=show_detailed, - show_progress=show_progress, - hip_clang_launch=hip_clang_launch, - is_pytorch_extension=is_pytorch_extension, - clean_ctx=clean_ctx) + if not os.path.isabs(f): + f = os.path.join(output_directory, f) + if f not in all_files_set: + all_files.append(f) + + # List all files in header_include_paths to ensure they are hipified + from pathlib import Path + for header_include_dir in header_include_dirs: + if os.path.isabs(header_include_dir): + header_include_dir_path = Path(header_include_dir) + else: + header_include_dir_path = Path(os.path.join(output_directory, header_include_dir)) + for path in header_include_dir_path.rglob('*'): + if ( + path.is_file() + and _fnmatch(str(path), includes) + and (not _fnmatch(str(path), ignores)) + and match_extensions(path.name, header_extensions) + ): + all_files.append(str(path)) + + if clean_ctx is None: + clean_ctx = GeneratedFileCleaner(keep_intermediates=True) + + # Preprocessing statistics. + stats: Dict[str, List] = {"unsupported_calls": [], "kernel_launches": []} + + for filepath in (all_files if not hipify_extra_files_only else extra_files): + preprocess_file_and_save_result(output_directory, filepath, all_files, header_include_dirs, + stats, hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress) + + print(bcolors.OKGREEN + "Successfully preprocessed all matching files." + bcolors.ENDC, file=sys.stderr) + + # Show detailed summary + if show_detailed: + compute_stats(stats) + + return HIPIFY_FINAL_RESULT