diff --git a/.circleci/config.yml b/.circleci/config.yml index bcd2a5527b9835..476e9867cf2a4a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,11 +5,37 @@ docker_config_defaults: &docker_config_defaults aws_access_key_id: AKIAJ2J6FIG5OSZTQ3IA aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} +# NOTE: We only perform the merge in build step and not in test step, because +# all source files will be shared from build to test +merge_pull_request_onto_master: &merge_pull_request_onto_master + name: Merge Onto Master + no_output_timeout: "10h" + command: | + if [[ "${CIRCLE_BRANCH}" != "master" ]]; then + git config --global user.email "circleci.ossci@gmail.com" + git config --global user.name "CircleCI" + + git config remote.origin.url https://github.com/pytorch/pytorch.git + git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master + git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=50 --quiet + + export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master` + echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET} + export GIT_COMMIT=${CIRCLE_SHA1} + echo "GIT_COMMIT: " ${GIT_COMMIT} + + git checkout -f ${GIT_COMMIT} + git reset --hard ${GIT_COMMIT} + git merge --no-edit --no-ff ${GIT_MERGE_TARGET} + fi + pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults resource_class: large working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -22,7 +48,7 @@ pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults # This IAM user allows write access to S3 bucket for sccache export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init .jenkins/pytorch/build.sh .jenkins/pytorch/test.sh @@ -31,6 +57,8 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -46,16 +74,18 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults # This IAM user allows write access to S3 bucket for sccache export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init || git submodule update --init || git submodule update --init + git submodule sync && git submodule update --init .jenkins/pytorch/build.sh - mkdir -p pytorch-ci-env/ - cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch pytorch-ci-env/torch - cp -r build/bin pytorch-ci-env/cpp_test_bin + export PYTORCH_CI_ENV_DIR=/var/lib/jenkins/pytorch-ci-env + mkdir -p ${PYTORCH_CI_ENV_DIR} + cp -r /var/lib/jenkins/workspace ${PYTORCH_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step + cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch ${PYTORCH_CI_ENV_DIR}/torch + cp -r build/bin ${PYTORCH_CI_ENV_DIR}/cpp_test_bin if [ -d "../cpp-build" ]; then - cp -r ../cpp-build pytorch-ci-env/cpp-build + cp -r ../cpp-build ${PYTORCH_CI_ENV_DIR}/cpp-build fi - persist_to_workspace: - root: /var/lib/jenkins/workspace/pytorch-ci-env + root: /var/lib/jenkins/pytorch-ci-env paths: - "*" @@ -63,7 +93,6 @@ pytorch_linux_test_defaults: 
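The merge_pull_request_onto_master anchor above rebuilds every pull request on top of the current tip of master rather than the original branch point, and it runs only in the build job because the merged tree is handed to the test jobs through the persisted workspace. A minimal standalone sketch of the same merge step, assuming a generic origin remote and CircleCI's CIRCLE_BRANCH / CIRCLE_SHA1 variables (the committer identity below is a placeholder):

#!/usr/bin/env bash
# Sketch: rebuild a PR on top of the latest master before compiling.
set -euo pipefail

if [[ "${CIRCLE_BRANCH}" != "master" ]]; then
  git config user.email "ci-bot@example.com"   # placeholder identity
  git config user.name  "CI"

  # Fetch only master, shallowly, so the merge target is the current tip.
  git fetch --quiet --depth=50 origin +refs/heads/master:refs/remotes/origin/master
  merge_target=$(git rev-parse origin/master)

  git checkout -f "${CIRCLE_SHA1}"
  # --no-ff keeps the PR head and the master tip as separate parents, so a
  # conflicting merge fails the job here rather than silently fast-forwarding.
  git merge --no-edit --no-ff "${merge_target}"
fi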
&pytorch_linux_test_defaults machine: image: default steps: - - checkout - run: name: Prepare workspace command: | @@ -107,12 +136,16 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) fi pwd + + cp -r /opt/workspace/build_workspace/. /home/circleci/project # This copies all source files from build step to the current step + echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env echo "declare -x PYTHON_VERSION=${PYTHON_VERSION}" >> /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env # This IAM user allows write access to S3 bucket for sccache echo "declare -x AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA" >> /home/circleci/project/env echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET}" >> /home/circleci/project/env + mkdir -p /home/circleci/project/build cp -r /opt/workspace/cpp_test_bin /home/circleci/project/build/bin docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" @@ -122,9 +155,9 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults docker cp "/opt/workspace/cpp-build" "$id:/var/lib/jenkins/cpp-build" fi if [ -n "${MULTI_GPU}" ]; then - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && (git submodule update --init || git submodule update --init || git submodule update --init) && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash else - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && (git submodule update --init || git submodule update --init || git submodule update --init) && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash fi caffe2_linux_build_defaults: &caffe2_linux_build_defaults @@ -132,6 +165,8 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -152,7 +187,7 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults cd third_party/onnx && git fetch --tags --progress origin +refs/pull/*:refs/remotes/origin/pr/* && cd - # Reinitialize submodules - git submodule update --init --recursive + git submodule sync && git submodule update --init --recursive # Ensure jenkins can write to the ccache root dir. 
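The test jobs above no longer run a checkout of their own: they attach the workspace persisted by the build job, stage it under /home/circleci/project, copy it into a fresh container with docker cp, and pipe the test command into docker exec. A rough sketch of that hand-off, with a placeholder image name and the test script path taken from the config above:

# Sketch: run the persisted build output inside a fresh test container.
set -e
DOCKER_IMAGE="registry.example.com/pytorch-ci:latest"   # placeholder image

id=$(docker run -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}")

# The build job persisted sources and binaries; CircleCI attaches them under
# /opt/workspace. Stage them locally, then copy them into the container.
cp -r /opt/workspace/build_workspace/. /home/circleci/project
docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace"

# Run the test script as the jenkins user inside the container.
echo 'cd workspace && .jenkins/pytorch/test.sh' | docker exec -u jenkins -i "$id" bash
docker rm -f "$id"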
sudo chown jenkins:jenkins "${HOME}/.ccache" @@ -189,16 +224,18 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults fi # Copy all necessary binaries to shared workspace - mkdir -p caffe2-ci-env - cp -r third_party/onnx caffe2-ci-env/onnx + export CAFFE2_CI_ENV_DIR=/var/lib/jenkins/caffe2-ci-env + mkdir -p ${CAFFE2_CI_ENV_DIR} + cp -r /var/lib/jenkins/workspace ${CAFFE2_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step + cp -r third_party/onnx ${CAFFE2_CI_ENV_DIR}/onnx if [ -d "/usr/local/caffe2" ]; then - cp -r /usr/local/caffe2 caffe2-ci-env/caffe2 + cp -r /usr/local/caffe2 ${CAFFE2_CI_ENV_DIR}/caffe2 fi if [ -d "/opt/conda" ]; then - cp -r /opt/conda caffe2-ci-env/conda_env + cp -r /opt/conda ${CAFFE2_CI_ENV_DIR}/conda_env fi - persist_to_workspace: - root: /var/lib/jenkins/workspace/caffe2-ci-env + root: /var/lib/jenkins/caffe2-ci-env paths: - "*" @@ -206,7 +243,6 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults machine: image: default steps: - - checkout - run: name: Prepare workspace command: | @@ -250,6 +286,7 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) fi pwd + cp -r /opt/workspace/build_workspace/. /home/circleci/project # This copies all source files from build step to the current step echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env # This IAM user allows write access to S3 bucket for sccache @@ -315,6 +352,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -326,7 +365,7 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults brew install cmake # Reinitialize submodules - git submodule update --init --recursive + git submodule sync && git submodule update --init --recursive # Reinitialize path (see man page for path_helper(8)) eval `/usr/libexec/path_helper -s` @@ -525,6 +564,8 @@ jobs: xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build environment: @@ -544,9 +585,12 @@ jobs: export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init chmod a+x .jenkins/pytorch/macos-build.sh .jenkins/pytorch/macos-build.sh + + # TODO: need to share source files from build to test, when macOS builds are enabled + - persist_to_workspace: root: /Users/distiller/pytorch-ci-env paths: @@ -556,7 +600,6 @@ jobs: macos: xcode: "9.0" steps: - - checkout - run: name: Prepare workspace command: | @@ -570,9 +613,9 @@ jobs: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3 no_output_timeout: "10h" command: | + # TODO: need to share source files from build to test, when macOS builds are enabled set -ex export IN_CIRCLECI=1 - git submodule update --init chmod a+x .jenkins/pytorch/macos-test.sh .jenkins/pytorch/macos-test.sh @@ -581,6 +624,8 @@ jobs: xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build environment: @@ -616,7 +661,7 @@ jobs: export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init chmod a+x .jenkins/pytorch/macos-build.sh 
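Several steps above switch from a plain `git submodule update --init` to `git submodule sync && git submodule update --init`. The sync matters when .gitmodules has changed a submodule URL: a cached checkout still carries the old URL in .git/config, so the update alone would keep fetching from the stale remote. A short illustration of the difference (no particular submodule is assumed):

# A cached checkout records submodule URLs in .git/config at init time, so if
# .gitmodules later changes a URL, a bare update keeps fetching the old remote:
git submodule update --init --recursive        # may hit the stale URL

# `sync` copies the URLs from .gitmodules back into .git/config first, so the
# subsequent update fetches from the right place:
git submodule sync --recursive
git submodule update --init --recursive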
.jenkins/pytorch/macos-build.sh @@ -845,36 +890,36 @@ workflows: version: 2 build: jobs: - - pytorch_linux_trusty_py2_7_9_build_test - - pytorch_linux_trusty_py2_7_build_test - - pytorch_linux_trusty_py3_5_build_test - - pytorch_linux_trusty_py3_6_gcc4_8_build_test - - pytorch_linux_trusty_py3_6_gcc5_4_build_test - - pytorch_linux_trusty_py3_6_gcc7_build_test - - pytorch_linux_trusty_pynightly_build_test - - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_py3_clang5_asan_test: - requires: - - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_test: - requires: - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: - requires: - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_test: - requires: - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_test: - requires: - - pytorch_linux_xenial_cuda9_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: - requires: - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build + # - pytorch_linux_trusty_py2_7_9_build_test + # - pytorch_linux_trusty_py2_7_build_test + # - pytorch_linux_trusty_py3_5_build_test + # - pytorch_linux_trusty_py3_6_gcc4_8_build_test + # - pytorch_linux_trusty_py3_6_gcc5_4_build_test + # - pytorch_linux_trusty_py3_6_gcc7_build_test + # - pytorch_linux_trusty_pynightly_build_test + # - pytorch_linux_xenial_py3_clang5_asan_build + # - pytorch_linux_xenial_py3_clang5_asan_test: + # requires: + # - pytorch_linux_xenial_py3_clang5_asan_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_test: + # requires: + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: + # requires: + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda9_cudnn7_py2_build + # - pytorch_linux_xenial_cuda9_cudnn7_py2_test: + # requires: + # - pytorch_linux_xenial_cuda9_cudnn7_py2_build + # - pytorch_linux_xenial_cuda9_cudnn7_py3_build + # - pytorch_linux_xenial_cuda9_cudnn7_py3_test: + # requires: + # - pytorch_linux_xenial_cuda9_cudnn7_py3_build + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: + # requires: + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build # - pytorch_macos_10_13_py3_build # - pytorch_macos_10_13_py3_test: @@ -882,48 +927,48 @@ workflows: # - pytorch_macos_10_13_py3_build # - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: - requires: - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_mkl_ubuntu16_04_build - - caffe2_py2_mkl_ubuntu16_04_test: - requires: - - caffe2_py2_mkl_ubuntu16_04_build - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: - requires: - - 
caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build - - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_py2_gcc4_8_ubuntu14_04_test: - requires: - - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_onnx_py2_gcc5_ubuntu16_04_build - - caffe2_onnx_py2_gcc5_ubuntu16_04_test: - requires: - - caffe2_onnx_py2_gcc5_ubuntu16_04_build - - caffe2_conda2_ubuntu16_04_build - - caffe2_conda2_ubuntu16_04_test: - requires: - - caffe2_conda2_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build - - caffe2_py2_gcc4_9_ubuntu14_04_build - - caffe2_py2_clang3_8_ubuntu16_04_build - - caffe2_py2_clang3_9_ubuntu16_04_build - - caffe2_py2_gcc6_ubuntu16_04_build - - caffe2_py2_gcc7_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_android_ubuntu16_04_build - - caffe2_conda3_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_centos7_build + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_mkl_ubuntu16_04_build + # - caffe2_py2_mkl_ubuntu16_04_test: + # requires: + # - caffe2_py2_mkl_ubuntu16_04_build + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build + # - caffe2_py2_gcc4_8_ubuntu14_04_build + # - caffe2_py2_gcc4_8_ubuntu14_04_test: + # requires: + # - caffe2_py2_gcc4_8_ubuntu14_04_build + # - caffe2_onnx_py2_gcc5_ubuntu16_04_build + # - caffe2_onnx_py2_gcc5_ubuntu16_04_test: + # requires: + # - caffe2_onnx_py2_gcc5_ubuntu16_04_build + # - caffe2_conda2_ubuntu16_04_build + # - caffe2_conda2_ubuntu16_04_test: + # requires: + # - caffe2_conda2_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_gcc4_9_ubuntu14_04_build + # - caffe2_py2_clang3_8_ubuntu16_04_build + # - caffe2_py2_clang3_9_ubuntu16_04_build + # - caffe2_py2_gcc6_ubuntu16_04_build + # - caffe2_py2_gcc7_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_android_ubuntu16_04_build + # - caffe2_conda3_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_centos7_build # - caffe2_py2_ios_macos10_13_build # - caffe2_py2_system_macos10_13_build diff --git a/.gitignore b/.gitignore index da783554fe1120..a56ae2ab81df84 100644 --- a/.gitignore +++ b/.gitignore @@ -25,16 +25,17 @@ aten/src/ATen/cuda/CUDAConfig.h build/ dist/ docs/src/**/* -docs/cpp/xml/ -docs/cpp/html/ -docs/cpp/api/ +docs/cpp/build +docs/cpp/source/api test/.coverage test/cpp/api/mnist +test/custom_operator/model.pt test/data/gpu_tensors.pt test/data/legacy_modules.t7 test/data/legacy_serialized.pt test/data/linear.pt test/htmlcov +test/cpp_extensions/install/ third_party/build/ tools/shared/_utils_internal.py torch.egg-info/ @@ -43,6 +44,7 @@ torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp torch/csrc/jit/generated/* +torch/csrc/jit/fusers/Config.h torch/csrc/nn/THCUNN.cpp torch/csrc/nn/THCUNN.cwrap torch/csrc/nn/THNN_generic.cpp @@ -65,6 +67,7 @@ torch/lib/protoc torch/lib/tmp_install torch/lib/torch_shm_manager torch/lib/python* +torch/share/ 
torch/version.py # IPython notebook checkpoints diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index d9b2a2e096a1e5..ffcbbc136b50d5 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -226,7 +226,7 @@ else export MAX_JOBS=`expr $(nproc) - 1` fi - USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user + USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user # This is to save test binaries for testing cp -r torch/lib/tmp_install $INSTALL_PREFIX diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index b0f9c41382601f..585f994367f339 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -49,6 +49,20 @@ fi mkdir -p $TEST_DIR/{cpp,python} +if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + + # Pin individual runs to specific gpu so that we can schedule + # multiple jobs on machines that have multi-gpu. + NUM_AMD_GPUS=$(/opt/rocm/bin/rocminfo | grep 'Device Type.*GPU' | wc -l) + if (( $NUM_AMD_GPUS == 0 )); then + echo >&2 "No AMD GPU detected!" + exit 1 + fi + export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % $NUM_AMD_GPUS)) +fi + cd "${WORKSPACE}" # C++ tests @@ -62,19 +76,27 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do */mkl_utils_test|*/aten/integer_divider_test) continue ;; - */aten/*) - # ATen uses test framework Catch2 - # NB: We do NOT use the xml test reporter, because - # Catch doesn't support multiple reporters + */scalar_tensor_test|*/basic|*/native_test) + if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + continue + else + "$test" + fi + ;; + *) + # Currently, we use a mixture of gtest (caffe2) and Catch2 (ATen). While + # planning to migrate to gtest as the common PyTorch c++ test suite, we + # currently do NOT use the xml test reporter, because Catch doesn't + # support multiple reporters # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 # which means that enabling XML output means you lose useful stdout # output for Jenkins. It's more important to have useful console # output than it is to have XML output for Jenkins. 
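The ROCm block added to .jenkins/caffe2/test.sh above replaces the old hard-coded assumption of four GPUs per box with a count taken from rocminfo, and pins each run to a single device by build number so several jobs can share one multi-GPU host. The same round-robin idea in isolation (the rocminfo path and variable names follow the script above):

# Round-robin one GPU per run so several CI jobs can share a multi-GPU host.
NUM_AMD_GPUS=$(/opt/rocm/bin/rocminfo | grep -c 'Device Type.*GPU')
if (( NUM_AMD_GPUS == 0 )); then
  echo >&2 "No AMD GPU detected!"
  exit 1
fi
# BUILD_NUMBER increases per job, so consecutive jobs land on different GPUs.
export HIP_VISIBLE_DEVICES=$(( BUILD_NUMBER % NUM_AMD_GPUS ))
echo "Pinned to GPU ${HIP_VISIBLE_DEVICES} of ${NUM_AMD_GPUS}"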
+ # Note: in the future, if we want to use xml test reporter once we switch + # to all gtest, one can simply do: + # "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" "$test" ;; - *) - "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" - ;; esac done @@ -98,9 +120,6 @@ fi rocm_ignore_test=() if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then - export LANG=C.UTF-8 - export LC_ALL=C.UTF-8 - # Currently these tests are failing on ROCM platform: # Unknown reasons, need to debug @@ -115,10 +134,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") - - # Our AMD CI boxes have 4 gpus on each - # Remove this once we have added multi-gpu support - export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % 4)) fi # Python tests diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 3ffed384b081b7..2dc64157c5d00d 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -11,7 +11,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then sudo apt-get install -y --allow-downgrades --allow-change-held-packages libnccl-dev=2.2.13-1+cuda9.0 libnccl2=2.2.13-1+cuda9.0 fi -if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]] || [[ "$BUILD_ENVIRONMENT" == *-trusty-py2.7.9* ]]; then # TODO: move this to Docker sudo apt-get update sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev @@ -102,12 +102,6 @@ fi # Add the test binaries so that they won't be git clean'ed away git add -f build/bin -# Testing ATen install -if [[ "$BUILD_ENVIRONMENT" != *cuda* ]]; then - echo "Testing ATen install" - time tools/test_aten_install.sh -fi - # Test C FFI plugins # cffi install doesn't work for Python 3.7 if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then @@ -124,7 +118,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs # TODO: Don't run this here pip install -r requirements.txt || true - make html + LC_ALL=C make html popd fi @@ -138,4 +132,14 @@ if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then pushd ../cpp-build/caffe2 WERROR=1 VERBOSE=1 DEBUG=1 python $BUILD_LIBTORCH_PY popd + + # Build custom operator tests. 
+ CUSTOM_OP_BUILD="$PWD/../custom-op-build" + CUSTOM_OP_TEST="$PWD/test/custom_operator" + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + mkdir "$CUSTOM_OP_BUILD" + pushd "$CUSTOM_OP_BUILD" + CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake "$CUSTOM_OP_TEST" + make VERBOSE=1 + popd fi diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 5ce6ee01a46975..ca728df2b826c5 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -112,7 +112,8 @@ else exit 1 fi -if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda9-cudnn7-py3 ]] || \ + [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then BUILD_TEST_LIBTORCH=1 else BUILD_TEST_LIBTORCH=0 diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 87e0476e418ba5..16d34342c544c8 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -78,13 +78,35 @@ test_cpp_api() { "$CPP_BUILD"/caffe2/bin/test_api } +test_custom_script_ops() { + echo "Testing custom script operators" + pushd test/custom_operator + # Build the custom operator library. + rm -rf build && mkdir build + pushd build + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake .. + make VERBOSE=1 + popd + + # Run tests Python-side and export a script module. + python test_custom_ops.py -v + python model.py --export-script-module=model.pt + # Run tests C++-side and load the exported script module. + build/test_custom_ops ./model.pt + popd +} + + if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then test_python_all test_cpp_api + test_custom_script_ops else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then test_python_all elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then test_cpp_api + test_custom_script_ops fi fi diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 05bd71602b9783..471fd8fac1fc6e 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -90,6 +90,8 @@ test_python_all_except_nn() { test_aten() { # Test ATen + # The following test(s) of ATen have already been skipped by caffe2 in rocm environment: + # scalar_tensor_test, basic, native_test if ([[ "$BUILD_ENVIRONMENT" != *asan* ]] && [[ "$BUILD_ENVIRONMENT" != *rocm* ]]); then echo "Running ATen tests with pytorch lib" TORCH_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/torch/lib @@ -97,7 +99,7 @@ test_aten() { # put the dynamic libraries somewhere were the dynamic linker can find them. # This is a bit of a hack. if [[ "$BUILD_ENVIRONMENT" == *ppc64le* ]]; then - SUDO=sudo + SUDO=sudo fi ${SUDO} ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin @@ -140,12 +142,28 @@ test_libtorch() { fi } +test_custom_script_ops() { + if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Testing custom script operators" + CUSTOM_OP_BUILD="$PWD/../custom-op-build" + pushd test/custom_operator + cp -r "$CUSTOM_OP_BUILD" build + # Run tests Python-side and export a script module. + python test_custom_ops.py -v + python model.py --export-script-module=model.pt + # Run tests C++-side and load the exported script module. 
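Both the Linux build script and the new test_custom_script_ops function above build the custom-operator test project out of tree against the torch package that was just installed, by pointing CMAKE_PREFIX_PATH at the CMake config files shipped inside site-packages/torch (typically consumed through a find_package call in the project's CMakeLists). A condensed sketch of that pattern; the source and build directory names are illustrative:

# Build an out-of-tree CMake project against the torch installed in the
# current Python environment.
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
SRC_DIR="$PWD/test/custom_operator"     # any project with its own CMakeLists.txt
BUILD_DIR="$PWD/../custom-op-build"

mkdir -p "$BUILD_DIR"
pushd "$BUILD_DIR"
# CMAKE_PREFIX_PATH lets the project's find_package call locate the CMake
# config files that ship inside the installed torch package.
CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake "$SRC_DIR"
make VERBOSE=1
popd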
+ build/test_custom_ops ./model.pt + popd + fi +} + if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then test_python_nn test_python_all_except_nn test_aten test_torchvision test_libtorch + test_custom_script_ops else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then test_python_nn @@ -154,5 +172,6 @@ else test_aten test_torchvision test_libtorch + test_custom_script_ops fi fi diff --git a/.travis.yml b/.travis.yml index be45e69f67cb2f..77d430ee8917a4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,4 +28,4 @@ matrix: script: mypy @mypy-files.txt - env: CPP_DOC_CHECK install: sudo apt-get install -y doxygen - script: cd docs/cpp && ./check-doxygen.sh + script: cd docs/cpp/source && ./check-doxygen.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index b7f56f96e87f3a..827121b1fc5931 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) +option(BUILD_CAFFE2_OPS "Build Caffe2 operators" ON) option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON) cmake_dependent_option( CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON @@ -115,7 +116,7 @@ option(USE_IDEEP "Use IDEEP interface in MKL BLAS" ON) option(USE_MKLML "Use MKLML interface in MKL BLAS" ON) option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( - USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." OFF + USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) cmake_dependent_option( USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON @@ -123,6 +124,7 @@ cmake_dependent_option( cmake_dependent_option( USE_GLOO_IBVERBS "Use Gloo IB verbs for distributed. Only available if USE_GLOO is on." OFF "USE_GLOO" OFF) +option(TORCH_USE_CEREAL "Build the C++ API with Cereal for serialization support" OFF) # Used when building Caffe2 through setup.py option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" OFF) diff --git a/CODEOWNERS b/CODEOWNERS index 113be035c9b99f..5723d6ebe5c058 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ # Each line is a file pattern followed by one or more owners. /aten/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang +/aten/src/ATen/core/ /torch/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang /docs/source @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ssnl @zou3519 /docs/cpp @goldsborough @ebetica @apaszke @soumith @colesbury @gchanan @zdevito @ezyang diff --git a/README.md b/README.md index b909001edc6f14..b23bc60aa19de6 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ We are in an early-release beta. Expect some adventures and rough edges. - [Binaries](#binaries) - [From Source](#from-source) - [Docker Image](#docker-image) + - [Building the Documentation](#building-the-documentation) - [Previous Versions](#previous-versions) - [Getting Started](#getting-started) - [Communication](#communication) @@ -200,9 +201,8 @@ set DISTUTILS_USE_SDK=1 REM The following two lines are needed for Python 2.7, but the support for it is very experimental. set MSSdk=1 set FORCE_PY27_BUILD=1 -REM As for CUDA 8, VS2015 Update 3 is also required to build PyTorch. Use the following two lines. 
-set "PREBUILD_COMMAND=%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" -set PREBUILD_COMMAND_ARGS=x64 +REM As for CUDA 8, VS2015 Update 3 is also required to build PyTorch. Use the following line. +set "CUDA_HOST_COMPILER=%VS140COMNTOOLS%\..\..\VC\bin\amd64\cl.exe" call "%VS150COMNTOOLS%\vcvarsall.bat" x64 -vcvars_ver=14.11 python setup.py install @@ -224,6 +224,18 @@ Please note that PyTorch uses shared memory to share data between processes, so for multithreaded data loaders) the default shared memory segment size that container runs with is not enough, and you should increase shared memory size either with `--ipc=host` or `--shm-size` command line options to `nvidia-docker run`. +### Building the Documentation + +To build documentation in various formats, you will need Sphinx and the +readthedocs theme. + +``` +cd docs/ +pip install -r requirements.txt +``` +You can then build the documentation by running ``make `` from the +``docs/`` folder. Run ``make`` to get a list of all available output formats. + ### Previous Versions Installation instructions and binaries for previous PyTorch versions may be found diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index ee025265a982e7..0f0019d57b11e9 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -1,19 +1,5 @@ -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(ATen CXX C) - include(CMakeDependentOption) - option(USE_CUDA "Use CUDA" ON) - option(USE_ROCM "Use ROCm" OFF) - option(USE_CUDNN "Use cuDNN" ON) - option(USE_MKLDNN "Use MKLDNN" ON) - cmake_dependent_option( - USE_CUDNN "Use cuDNN" ON - "USE_CUDA" OFF) - option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) -else() - if (BUILD_ATEN_MOBILE) - return() - endif() +if (BUILD_ATEN_MOBILE) + return() endif() # Find modules @@ -42,32 +28,6 @@ SET(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory") SET(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory") SET(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory") -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # ---[ Build variables set within the cmake tree - include(../cmake/BuildVariables.cmake) - set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.") - - # ---[ Misc checks to cope with various compiler modes - include(../cmake/MiscCheck.cmake) - - # External projects - include(ExternalProject) - - # ---[ Utils - # TODO: merge the following 3 files into cmake/public/utils.cmake. 
- include(../cmake/Utils.cmake) - include(../cmake/public/utils.cmake) - - # ---[ Dependencies - include(../cmake/Dependencies.cmake) - list(APPEND ATen_CPU_INCLUDE ${Caffe2_CPU_INCLUDE}) - list(APPEND ATen_CUDA_INCLUDE ${Caffe2_GPU_INCLUDE}) - list(APPEND ATen_CPU_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS}) - list(APPEND ATen_PUBLIC_CUDA_DEPENDENCY_LIBS - ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) -endif() - if(USE_CUDA) list(APPEND ATen_CUDA_INCLUDE ${CUDA_INCLUDE_DIRS}) endif() @@ -132,16 +92,14 @@ list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/src/ATen) add_subdirectory(src/ATen) -if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # Pass source, includes, and libs to parent - set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) - set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) - set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) - set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) - set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) - set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) - set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) - set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) -endif() +# Pass source, includes, and libs to parent +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) +set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) +set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) +set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) +set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) +set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index cf074730bf072b..29812852e24fdb 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -19,7 +19,7 @@ #include "ATen/core/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorGeometry.h" -#include "ATen/TensorMethods.h" +#include "ATen/core/TensorMethods.h" #include "ATen/TensorOperators.h" #include "ATen/core/TensorOptions.h" #include "ATen/Type.h" diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index f6d296dfe79e45..994756fa18c995 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -1,11 +1,6 @@ cmake_minimum_required(VERSION 3.0 FATAL_ERROR) SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # ---[ Generate and install header and cpp files - include(../../../cmake/Codegen.cmake) -endif() - IF(NOT MSVC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-qualifiers") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-ignored-qualifiers") @@ -352,34 +347,6 @@ if(NOT AT_LINK_STYLE STREQUAL "INTERFACE") endif() endif() -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # Eventually replace this use of LOCATION with use of - # $, but generators only work in some cases - cmake_policy(SET CMP0026 OLD) - get_target_property(ATEN_CPU_OUTPUT_NAME ATen_cpu LOCATION) - get_filename_component(ATEN_CPU_OUTPUT_NAME ${ATEN_CPU_OUTPUT_NAME} NAME) - set(ATEN_LIBRARIES - 
"${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CPU_OUTPUT_NAME}") - if(USE_CUDA OR USE_ROCM) - get_target_property(ATEN_CUDA_OUTPUT_NAME ATen_cuda LOCATION) - get_filename_component(ATEN_CUDA_OUTPUT_NAME ${ATEN_CUDA_OUTPUT_NAME} NAME) - list(APPEND ATEN_LIBRARIES - "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CUDA_OUTPUT_NAME}") - endif() - - install(TARGETS ATen_cpu - RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" - LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") - - if(USE_CUDA OR USE_ROCM) - install(TARGETS ATen_cuda - RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" - LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") - endif() -endif() - SET(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") CONFIGURE_FILE(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake") INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" @@ -404,38 +371,6 @@ else() add_subdirectory(test) endif() -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories( - ${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) - target_link_libraries(${test_name} ATen_cpu) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) - endforeach() - - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") - target_include_directories( - ${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) - target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) - endforeach() - endif() - - # Make sure these don't get built by parent - set(ATen_CPU_TEST_SRCS) - set(ATen_CUDA_TEST_SRCS) -endif() - # Pass source, includes, and libs to parent set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 40cb364e91fd0c..c73d2efd8ea813 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -12,6 +12,7 @@ #include "ATen/CPUGenerator.h" #include "ATen/RegisterCPU.h" +#include "ATen/Tensor.h" #include "TH/TH.h" // for USE_LAPACK @@ -107,15 +108,19 @@ bool Context::setFlushDenormal(bool on) { #endif } -Type& getType(TensorOptions options) { +TypeExtendedInterface& getType(TensorOptions options) { return globalContext().getType( options.backend(), options.dtype(), options.is_variable()); } -Type& getType(const TensorImpl* impl) { +TypeExtendedInterface& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); return globalContext().getType( - backend, impl->scalar_type(), impl->is_variable()); + backend, dataTypeToScalarType(impl->dtype().id()), impl->is_variable()); +} + +TypeExtendedInterface& getType(const Tensor& t) { + return getType(t.unsafeGetTensorImpl()); } Allocator* getCPUAllocator() { diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 
7b3634dd83086f..4e147cffabbe86 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -5,6 +5,7 @@ #include "ATen/CUDAStream.h" #include "ATen/core/Generator.h" #include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" #include "ATen/Utils.h" #include "ATen/core/Error.h" #include "ATen/detail/CUDAHooksInterface.h" @@ -21,23 +22,25 @@ namespace at { +struct Tensor; + class AT_API Context { public: Context(); - Type* getNonVariableTypeRaw(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s); + TypeExtendedInterface* getNonVariableTypeRaw(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s)); } - Type * getNonVariableTypeOpt(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s); + TypeExtendedInterface * getNonVariableTypeOpt(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s)); } - Type & getNonVariableType(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableType(p, s); + TypeExtendedInterface & getNonVariableType(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableType(p, s)); } - Type & getVariableType(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getVariableType(p, s); + TypeExtendedInterface & getVariableType(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getVariableType(p, s)); } - Type & getType(Backend p, ScalarType s, bool is_variable) { - return globalLegacyTypeDispatch().getType(p, s, is_variable); + TypeExtendedInterface & getType(Backend p, ScalarType s, bool is_variable) { + return static_cast(globalLegacyTypeDispatch().getType(p, s, is_variable)); } // The passed in Type must be delete'able // TODO: Just make it take a unique_ptr @@ -142,24 +145,25 @@ static inline void init() { } } -static inline Type& getNonVariableType(Backend p, ScalarType s) { +static inline TypeExtendedInterface& getNonVariableType(Backend p, ScalarType s) { return globalContext().getNonVariableType(p, s); } -static inline Type& getNonVariableType(DeviceType p, ScalarType s) { +static inline TypeExtendedInterface& getNonVariableType(DeviceType p, ScalarType s) { return globalContext().getNonVariableType(deviceTypeToBackend(p), s); } -AT_API Type& getType(TensorOptions options); -AT_API Type& getType(const TensorImpl*); +AT_API TypeExtendedInterface& getType(TensorOptions options); +AT_API TypeExtendedInterface& getType(const TensorImpl*); +AT_API TypeExtendedInterface& getType(const Tensor&); AT_API Allocator* getCPUAllocator(); -static inline Type& CPU(ScalarType s) { +static inline TypeExtendedInterface& CPU(ScalarType s) { return getNonVariableType(Backend::CPU, s); } -static inline Type& CUDA(ScalarType s) { +static inline TypeExtendedInterface& CUDA(ScalarType s) { return getNonVariableType(Backend::CUDA, s); } diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 9d67537ccdedd8..d45815c5b600c9 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -302,8 +302,7 @@ output: True - accreal start - accreal end - - arg: accreal step - default: 1 + - accreal step ]] [[ name: _arange @@ -320,8 +319,7 @@ output: True - accreal start - accreal end - - arg: accreal step - default: 1 + - accreal step - cname: arange arguments: - arg: THTensor* result @@ -1956,8 +1954,7 @@ output: True - real start - real end - - arg: long steps - default: 100 + - 
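In the Context.h hunk above, the getType/getNonVariableType accessors still fetch Type objects from the legacy dispatch registry but now static_cast the returned reference to TypeExtendedInterface&, which is sound only because every concrete Type the registry hands out is known to implement that extended interface. A stripped-down illustration of the pattern with made-up names (Base, Extended, and Registry are not ATen types):

#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

struct Base { virtual ~Base() = default; };
// The richer interface that every concrete implementation actually derives from.
struct Extended : Base { virtual int extra() const { return 42; } };

struct Registry {
  // The registry only knows about Base, but is only ever populated
  // with Extended instances.
  std::vector<std::unique_ptr<Base>> slots;
  Base& get(std::size_t i) { return *slots.at(i); }
};

// The accessor narrows the static type for callers, mirroring the
// static_cast to TypeExtendedInterface& in Context.h.
Extended& getExtended(Registry& r, std::size_t i) {
  return static_cast<Extended&>(r.get(i));
}

int main() {
  Registry r;
  r.slots.push_back(std::make_unique<Extended>());
  assert(getExtended(r, 0).extra() == 42);
}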
long steps ]] [[ name: _logspace @@ -1976,8 +1973,7 @@ output: True - real start - real end - - arg: long steps - default: 100 + - long steps ]] [[ name: histc @@ -2471,11 +2467,12 @@ - THTensor* mat2 ]] [[ - name: bmm + name: _th_bmm cname: baddbmm variants: - - method - function + backends: + - CUDA return: argument 0 arguments: - arg: THTensor* result @@ -2525,10 +2522,12 @@ - THTensor* batch2 ]] [[ - name: baddbmm + name: _th_baddbmm + cname: baddbmm variants: - - method - function + backends: + - CUDA return: argument 0 arguments: - arg: THTensor* result @@ -2544,22 +2543,6 @@ - THTensor* batch1 - THTensor* batch2 ]] -[[ - name: baddbmm_ - cname: baddbmm - return: argument 0 - arguments: - - THTensor* self - - arg: real beta - default: AS_REAL(1) - kwarg_only: True - - THTensor* self - - arg: real alpha - default: AS_REAL(1) - kwarg_only: True - - THTensor* batch1 - - THTensor* batch2 -]] [[ name: addcmul variants: diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index b51d80d22d350f..4da336aef5b7cd 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,12 @@ struct DeviceGuard { } } + explicit DeviceGuard(optional device_opt) { + if (device_opt.has_value() && device_opt.value().is_cuda()) { + set_index(device_opt.value().index()); + } + } + /// Calls `set_index` with the given index. explicit DeviceGuard(int32_t index) { set_index(index); diff --git a/aten/src/ATen/DimVector.h b/aten/src/ATen/DimVector.h index 9bbf37b5a9f4d4..cb652fffcb1481 100644 --- a/aten/src/ATen/DimVector.h +++ b/aten/src/ATen/DimVector.h @@ -1,11 +1,2 @@ #pragma once - -#include -#include - -namespace at { - -/// A container for sizes or strides -using DimVector = SmallVector; - -} +#include diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index c598901b2b943f..64f181d4dccb3c 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -79,3 +79,52 @@ AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ } \ }() + +#define AT_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX(TYPE, NAME, ...) 
\ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() diff --git a/aten/src/ATen/Formatting.h b/aten/src/ATen/Formatting.h index 6a8b502cc978a8..392e2a27b0130c 100644 --- a/aten/src/ATen/Formatting.h +++ b/aten/src/ATen/Formatting.h @@ -1,24 +1 @@ -#pragma once - -#include -#include "ATen/Type.h" -#include "ATen/core/Scalar.h" - -namespace at { - -AT_API std::ostream& operator<<(std::ostream & out, IntList list); -AT_API std::ostream& operator<<(std::ostream & out, Backend b); -AT_API std::ostream& operator<<(std::ostream & out, const Type & t); -AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); -static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { - return print(out,t,80); -} -static inline void print(const Tensor & t, int64_t linesize=80) { - print(std::cout,t,linesize); -} - -static inline std::ostream& operator<<(std::ostream & out, Scalar s) { - return out << (s.isFloatingPoint() ? s.toDouble() : s.toLong()); -} - -} +#include diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 6aadd62eb1d3fd..a4c8b50abe8263 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -27,7 +27,7 @@ inline void parallel_for( const int64_t grain_size, const F& f) { #ifdef _OPENMP -#pragma omp parallel if ((end - begin) >= grain_size) +#pragma omp parallel if (!omp_in_parallel() && ((end - begin) >= grain_size)) { int64_t num_threads = omp_get_num_threads(); int64_t tid = omp_get_thread_num(); diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 3f13d59b4467e5..66b71dd7b8a650 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -28,13 +28,13 @@ namespace { // // This means that we allocate a [1,0] size indices tensor and a [0] size // values tensor for such an empty tensor. 
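The AT_DISPATCH_*_COMPLEX macros added to Dispatch.h above follow the usual ATen dispatch shape: switch on the runtime scalar type and invoke the caller's lambda with the matching C++ type bound to scalar_t, so one templated kernel body covers every dtype. A self-contained miniature of that pattern over a hypothetical two-value dtype enum (not ATen's ScalarType):

#include <complex>
#include <cstdio>
#include <stdexcept>

enum class Dtype { Float, ComplexFloat };   // stand-in for at::ScalarType

// Minimal analogue of an AT_DISPATCH macro: pick scalar_t at runtime,
// then run the caller-supplied lambda with that alias in scope.
#define DISPATCH_FLOAT_AND_COMPLEX(DTYPE, NAME, ...)              \
  [&] {                                                           \
    switch (DTYPE) {                                              \
      case Dtype::Float: {                                        \
        using scalar_t = float;                                   \
        return __VA_ARGS__();                                     \
      }                                                           \
      case Dtype::ComplexFloat: {                                 \
        using scalar_t = std::complex<float>;                     \
        return __VA_ARGS__();                                     \
      }                                                           \
      default:                                                    \
        throw std::runtime_error(NAME ": unsupported dtype");     \
    }                                                             \
  }()

int main() {
  Dtype dt = Dtype::ComplexFloat;
  DISPATCH_FLOAT_AND_COMPLEX(dt, "fill_demo", [&] {
    scalar_t value{};            // scalar_t is std::complex<float> here
    std::printf("sizeof(scalar_t) = %zu\n", sizeof(value));
  });
}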
-SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, at::ScalarType scalar_type) - : TensorImpl(type_id, scalar_type, nullptr, false) +SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeMeta& data_type) + : TensorImpl(type_id, data_type, nullptr, false) , size_{0} , sparseDims_(1) , denseDims_(0) , indices_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), ScalarType::Long)->tensor({1, 0})) - , values_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), scalar_type)->tensor()) {} + , values_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), dataTypeToScalarType(data_type.id()))->tensor()) {} IntList SparseTensorImpl::sizes() const { return size_; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 835b45c2a541ee..42b670bea08541 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -1,7 +1,7 @@ #pragma once #include "ATen/Tensor.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/core/Error.h" namespace at { @@ -36,7 +36,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... - explicit SparseTensorImpl(at::TensorTypeId, at::ScalarType); + explicit SparseTensorImpl(at::TensorTypeId, const caffe2::TypeMeta&); int64_t nnz() const { return values_.size(0); } int64_t sparseDims() const { return sparseDims_; } diff --git a/aten/src/ATen/Tensor.h b/aten/src/ATen/Tensor.h new file mode 100644 index 00000000000000..cef05f5341cb31 --- /dev/null +++ b/aten/src/ATen/Tensor.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index ca3fcd961feda0..b11c7bb159900b 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -8,15 +9,7 @@ bool TensorGeometry::is_contiguous() const { if (numel_ == 0) { return true; } - int64_t dim = sizes_.size(); - int64_t expected_stride = 1; - for (int64_t i = dim - 1; i >= 0; i--) { - if (sizes_[i] != 1 && strides_[i] != expected_stride) { - return false; - } - expected_stride *= sizes_[i]; - } - return true; + return at::geometry_is_contiguous(sizes_, strides_); } Tensor TensorGeometry::zeros_with_stride(const Type& type) const { diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index e0a649a49b6ccd..34ece0fc0d03fb 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -1,204 +1,2 @@ #pragma once - -#include -#include - -#include "ATen/core/Storage.h" -#include "ATen/core/optional.h" -#include "ATen/core/TensorTypeId.h" -#include "ATen/core/TensorTypeIdRegistration.h" -#include "ATen/core/LegacyTypeDispatch.h" -#include "ATen/core/Backend.h" - -struct THTensor; - -namespace at { -class Scalar; -struct Type; -struct Storage; -struct Tensor; -} // namespace at - -namespace at { -struct AT_API TensorImpl : public c10::intrusive_ptr_target { - TensorImpl() = delete; - TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); - TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); - - virtual void release_resources() override; - - Type & type() const { - // NB: It's valid to use getTypeRaw here, because the TensorImpl - // could not have been created without initializing the Type first. - // TODO: This is not actually true via the Caffe2 codepath! Make - // it so. 
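The Parallel.h change a little above adds !omp_in_parallel() to the `#pragma omp parallel if (...)` clause, so a parallel_for reached from code that is already inside a parallel region falls back to running serially in the calling thread instead of oversubscribing the machine. A minimal OpenMP illustration of that guard, independent of ATen (compile with -fopenmp):

#include <cstdio>
#include <omp.h>

void inner(int n) {
  // Without the omp_in_parallel() check, each outer thread could spawn its
  // own nested team (depending on nested-parallelism settings); with it,
  // nested calls simply run in the caller's thread.
  #pragma omp parallel if (!omp_in_parallel() && n >= 1000)
  {
    if (omp_get_thread_num() == 0)
      std::printf("inner: %d thread(s), nested=%d\n",
                  omp_get_num_threads(), omp_in_parallel());
  }
}

int main() {
  inner(5000);                 // top-level call: gets a parallel team
  #pragma omp parallel num_threads(2)
  { inner(5000); }             // nested call: runs single-threaded per caller
}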
- return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); - } - - TensorTypeId type_id() const { return type_id_; } - virtual IntList sizes() const; - virtual IntList strides() const; - virtual int64_t dim() const; - virtual const Storage& storage() const; - friend struct Type; - - virtual int64_t numel() const { -#ifdef DEBUG - AT_ASSERT(compute_numel() == numel_); -#endif - return numel_; - } - - virtual bool is_contiguous() const { -#ifdef DEBUG - AT_ASSERT(compute_contiguous() == is_contiguous_); -#endif - return is_contiguous_; - } - - // this is called by the generated wrapper code when there are conditions - // when this output tensor should be zero dimensional. e.g. when all inputs - // to a function 'add' were zero dimensional, then condition_when_zero_dim == true. - // we also prevent this from getting marked as a zero dim tensor if it is not - // the right shape afterall. - virtual TensorImpl* maybe_zero_dim(bool condition_when_zero_dim); - - // True if a tensor was auto-wrapped from a C++ or Python number. - // Wrapped numbers do not participate in the result type computation for - // mixed-type operations if there are any Tensors that are not wrapped - // numbers. Otherwise, they behave like their non-wrapped equivalents. - // See [Result type computation] in TensorIterator.h. - bool is_wrapped_number() const { - return is_wrapped_number_; - } - void set_wrapped_number(bool value) { - AT_ASSERT(dim() == 0); - is_wrapped_number_ = value; - } - - // ~~~~~ Autograd API ~~~~~ - // Some methods below are defined in TensorImpl.cpp because Tensor is an - // incomplete type. - - virtual void set_requires_grad(bool requires_grad) { - AT_ERROR("set_requires_grad is not implemented for Tensor"); - } - virtual bool requires_grad() const { - AT_ERROR("requires_grad is not implemented for Tensor"); - } - - virtual Tensor& grad(); - virtual const Tensor& grad() const; - - // TODO: make these protected - // Note: storage->size() may be greater than the recorded size - // of a tensor - at::Storage storage_; - - template - inline T * data() const { - return storage_.data() + storage_offset_; - } - - template - inline T * unsafe_data() const { - return storage_.unsafe_data() + storage_offset_; - } - - inline at::ScalarType scalar_type() const { - return scalar_type_; - } - - virtual int64_t storage_offset() const { - return storage_offset_; - } - - // represents that numel() == 0. 
- inline bool is_empty() const { - return numel() == 0; - } - - virtual void resize_dim(int64_t ndim) { - // NB: This is *truly* a resize; calling code (e.g., squeeze) - // assumes that old values are preserved - sizes_.resize(ndim); - strides_.resize(ndim); - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_size(int64_t dim, int64_t new_size) { - sizes_[dim] = new_size; - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_stride(int64_t dim, int64_t new_stride) { - strides_[dim] = new_stride; - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_storage_offset(int64_t storage_offset) { - storage_offset_ = storage_offset; - refresh_numel(); - refresh_contiguous(); - } - - // WARNING: This function does not check if the requested - // sizes/strides are in bounds for the storage that is allocated; - // this is the responsibility of the caller - void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { - AT_CHECK( - new_size.size() == new_stride.size(), - "dimensionality of sizes (", - new_size.size(), - ") must match dimensionality of strides (", - new_stride.size(), - ")"); - sizes_ = new_size.vec(); - strides_ = new_stride.vec(); - refresh_numel(); - refresh_contiguous(); - } - - virtual int64_t size(int64_t d) const; - virtual int64_t stride(int64_t d) const; - - bool is_variable() const { return is_variable_; }; - - private: - int64_t storage_offset_; - std::vector sizes_; - std::vector strides_; - - bool is_contiguous_; - int64_t numel_; - - int64_t compute_numel() const { - int64_t n = 1; - for (auto s : sizes()) { - n *= s; - } - return n; - } - bool compute_contiguous() const; - - protected: - void refresh_numel() { - numel_ = compute_numel(); - } - void refresh_contiguous() { - is_contiguous_ = compute_contiguous(); - } - TensorTypeId type_id_; - // INVARIANT: When storage is non-null, this scalar type must - // agree with the scalar type in storage - ScalarType scalar_type_; - bool is_variable_ = false; - bool is_wrapped_number_ = false; - - private: - TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); -}; -} // namespace at +#include diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 454ab9e91fd29e..4ec8e374c2e515 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -215,4 +215,24 @@ void * maybe_data_ptr(const Tensor& tensor) { void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; } + +// See TensorUtils.h on why this is useful now that we cache is_contiguous. 
+bool geometry_is_contiguous(IntList sizes, IntList strides) { + int64_t dim = sizes.size(); + int64_t expected_stride = 1; + bool contig_if_nonempty = true; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes[i] == 0) { + return true; + } + if (contig_if_nonempty) { + if (sizes[i] != 1 && strides[i] != expected_stride) { + contig_if_nonempty = false; + } + expected_stride *= sizes[i]; + } + } + return contig_if_nonempty; +} + } diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index cc7453f77375f6..2443bde4b482cb 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -78,4 +78,11 @@ AT_API void checkBackend(CheckedFrom c, at::ArrayRef t, at::Backend back AT_API void * maybe_data_ptr(const Tensor& tensor); AT_API void * maybe_data_ptr(const TensorArg& tensor); +// Return if the tensor geometry represented by `sizes` and `strides` is contiguous +// Although we cache is_contiguous in tensor now, this is till useful because it +// allows checking if a particular geometry is contiguous without explicitly +// constructing a tensor, e.g., when you want to choose a kernel strategy based +// on whether a subgeometry is contiguous. +AT_API bool geometry_is_contiguous(IntList sizes, IntList strides); + } diff --git a/aten/src/ATen/Type.h b/aten/src/ATen/Type.h new file mode 100644 index 00000000000000..0c95f43e0482e9 --- /dev/null +++ b/aten/src/ATen/Type.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp deleted file mode 100644 index 956c70b9f178af..00000000000000 --- a/aten/src/ATen/UndefinedTensor.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "ATen/UndefinedTensor.h" -#include "ATen/core/Error.h" - -namespace at { - -// should this use the globalContext? Can it get a context passed in somehow? 
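geometry_is_contiguous above answers "would a tensor with these sizes and strides be contiguous?" without constructing a tensor, and it deliberately treats any geometry containing a zero-sized dimension as contiguous no matter what strides are recorded for it. A standalone copy of that logic over plain vectors, with a few asserts showing the intended behaviour (IntList is replaced by std::vector here):

#include <cassert>
#include <cstdint>
#include <vector>

// Same logic as at::geometry_is_contiguous, written against plain vectors.
bool geometry_is_contiguous(const std::vector<int64_t>& sizes,
                            const std::vector<int64_t>& strides) {
  int64_t expected_stride = 1;
  bool contig_if_nonempty = true;
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; i--) {
    if (sizes[i] == 0) return true;          // empty geometries count as contiguous
    if (contig_if_nonempty) {
      if (sizes[i] != 1 && strides[i] != expected_stride)
        contig_if_nonempty = false;
      expected_stride *= sizes[i];
    }
  }
  return contig_if_nonempty;
}

int main() {
  assert( geometry_is_contiguous({2, 3}, {3, 1}));        // row-major 2x3
  assert(!geometry_is_contiguous({2, 3}, {1, 2}));        // column-major view
  assert( geometry_is_contiguous({2, 0, 3}, {7, 9, 11})); // zero-sized dim wins
}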
-UndefinedTensor::UndefinedTensor() -: TensorImpl(UndefinedTensorId(), ScalarType::Undefined, nullptr, /* is variable */ false) { -} - -IntList UndefinedTensor::sizes() const { - AT_ERROR("sizes() called on undefined Tensor"); -} - -int64_t UndefinedTensor::size(int64_t d) const { - AT_ERROR("size(dim) called on an undefined Tensor"); -} - -int64_t UndefinedTensor::stride(int64_t d) const { - AT_ERROR("stride(dim) called on an undefined Tensor"); -} - -int64_t UndefinedTensor::dim() const { - AT_ERROR("dim() called on undefined Tensor"); -} - -const Storage& UndefinedTensor::storage() const { - AT_ERROR("storage() called on undefined Tensor"); -} - -int64_t UndefinedTensor::storage_offset() const { - AT_ERROR("storage_offset() called on an undefined Tensor"); -} - -IntList UndefinedTensor::strides() const { - AT_ERROR("strides() called on undefined Tensor"); -} -UndefinedTensor UndefinedTensor::_singleton; - -} diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index bea9baf61892f5..8e9722eae3be09 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -8,6 +8,9 @@ UndefinedType::UndefinedType() ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } +caffe2::TypeMeta UndefinedType::typeMeta() const { + return scalarTypeToTypeMeta(scalarType()); +} Backend UndefinedType::backend() const { return Backend::Undefined; } diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 594fb99e61dc0e..4ccd6101851a72 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -14,6 +14,7 @@ namespace at { struct UndefinedType final : public TypeDefault { explicit UndefinedType(); virtual ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; virtual Backend backend() const override; virtual Allocator* allocator() const override; virtual Device getDeviceFromPtr(void* data) const override; diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index fff88e39aba053..c4473d1471ab7d 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -2,7 +2,7 @@ #include "ATen/core/ATenGeneral.h" #include "ATen/StorageImpl.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include #include "ATen/Formatting.h" @@ -44,12 +44,12 @@ static inline const Storage& checked_storage( name, "'"); } - if (expr.dtype() != data_type) { + if (expr.dtype().id() != data_type) { AT_ERROR( "Expected object of data type ", data_type, " but got data type ", - expr.dtype(), + expr.dtype().id(), " for argument #", pos, " '", diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 8e9db589c5267a..467a5664f6e05b 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -1,7 +1,7 @@ #pragma once #include "ATen/core/WrapDimMinimal.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" namespace at { diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h index f3d3a81a365c26..4d3df92fe0bc57 100644 --- a/aten/src/ATen/WrapDimUtilsMulti.h +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -1,6 +1,6 @@ #pragma once -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/WrapDimUtils.h" #include #include diff --git a/aten/src/ATen/copy_wrapper.py b/aten/src/ATen/copy_wrapper.py index e4651f8c846070..a746f5543901ae 100644 --- a/aten/src/ATen/copy_wrapper.py +++ b/aten/src/ATen/copy_wrapper.py @@ -238,7 +238,7 @@ def create(all_types, backend): 
top_env['copy_includes'].append( '#include "ATen/{}.h"'.format(the_type['Type'])) top_env['copy_includes'].append( - '#include "ATen/TensorImpl.h"') + '#include "ATen/core/TensorImpl.h"') # Code generation for the_type in all_types: diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp index 5bb595a0bce5de..bb670b315f16c9 100644 --- a/aten/src/ATen/core/ATenCoreTest.cpp +++ b/aten/src/ATen/core/ATenCoreTest.cpp @@ -1,9 +1,11 @@ #include +#include namespace at { static int CoreTestGlobal = 0; int CoreTest() { + Tensor x; return CoreTestGlobal++; } diff --git a/aten/src/ATen/core/C++17.h b/aten/src/ATen/core/C++17.h index d8440ceea0c21a..ac5b3022ed5ff1 100644 --- a/aten/src/ATen/core/C++17.h +++ b/aten/src/ATen/core/C++17.h @@ -12,6 +12,53 @@ namespace c10 { namespace guts { + + +#ifdef __cpp_lib_transformation_trait_aliases +template using conditional_t = std::conditional_t; +template using enable_if_t = std::enable_if_t; +template using add_lvalue_reference_t = std::add_lvalue_reference_t; +template using remove_reference_t = std::remove_reference_t; +template using remove_cv_t = std::remove_cv_t; +template using result_of_t = std::result_of_t; +template using decay_t = std::decay_t; +template using remove_const_t = std::remove_const_t; +template using remove_pointer_t = std::remove_pointer_t; +#else +template using conditional_t = typename std::conditional::type; +template using enable_if_t = typename std::enable_if::type; +template using add_lvalue_reference_t = typename std::add_lvalue_reference::type; +template using remove_reference_t = typename std::remove_reference::type; +template using remove_cv_t = typename std::remove_cv::type; +template using result_of_t = typename std::result_of::type; +template using decay_t = typename std::decay::type; +template using remove_const_t = typename std::remove_const::type; +template using remove_pointer_t = typename std::remove_pointer::type; +#endif + + + + +// C++11 doesn't have constexpr std::move / std::forward. +// Implementation taken from libc++. +template +constexpr inline guts::remove_reference_t&& move(T&& t) noexcept { + return static_cast&&>(t); +} +template +constexpr inline T&& forward(guts::remove_reference_t& t) noexcept { + return static_cast(t); +} +template +constexpr inline T&& forward(guts::remove_reference_t&& t) noexcept { + static_assert(!std::is_lvalue_reference::value, + "can not forward an rvalue as an lvalue."); + return static_cast(t); +} + + + + #if __cplusplus >= 201402L || defined(__cpp_lib_make_unique) && __cpp_lib_make_unique >= 201304L || \ (defined(__ANDROID__) && __ANDROID__ && __cplusplus >= 201300L) || defined(_MSC_VER) && _MSC_VER >= 1900 @@ -23,7 +70,7 @@ namespace c10 { namespace guts { template typename std::enable_if::value, std::unique_ptr>::type make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); + return std::unique_ptr(new T(forward(args)...)); } // Allows 'make_unique(10)'. (N3690 s20.9.1.4 p3-4) template @@ -39,6 +86,7 @@ make_unique(Args&&...) 
= delete; #endif + #ifdef __cpp_lib_integer_sequence template using integer_sequence = std::integer_sequence; @@ -73,26 +121,6 @@ template using index_sequence_for = make_index_sequence using conditional_t = std::conditional_t; -template using enable_if_t = std::enable_if_t; -template using add_lvalue_reference_t = std::add_lvalue_reference_t; -template using remove_reference_t = std::remove_reference_t; -template using remove_cv_t = std::remove_cv_t; -template using result_of_t = std::result_of_t; -template using decay_t = std::decay_t; -template using remove_const_t = std::remove_const_t; -#else -template using conditional_t = typename std::conditional::type; -template using enable_if_t = typename std::enable_if::type; -template using add_lvalue_reference_t = typename std::add_lvalue_reference::type; -template using remove_reference_t = typename std::remove_reference::type; -template using remove_cv_t = typename std::remove_cv::type; -template using result_of_t = typename std::result_of::type; -template using decay_t = typename std::decay::type; -template using remove_const_t = typename std::remove_const::type; -#endif - #ifdef __cpp_lib_logical_traits @@ -153,7 +181,7 @@ template using void_t = typename make_void::type; template inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { - return std::apply(std::forward(f), std::forward(t)); + return std::apply(forward(f), forward(t)); } #else @@ -162,19 +190,19 @@ inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { // TODO This is an incomplete implementation of std::apply, not working for member functions. namespace detail { template -constexpr auto apply_impl(F&& f, Tuple&& t, guts::index_sequence) -> decltype(std::forward(f)(std::get(std::forward(t))...)) +constexpr auto apply_impl(F&& f, Tuple&& t, guts::index_sequence) -> decltype(forward(f)(std::get(forward(t))...)) { - return std::forward(f)(std::get(std::forward(t))...); + return forward(f)(std::get(forward(t))...); } } // namespace detail template constexpr auto apply(F&& f, Tuple&& t) -> decltype(detail::apply_impl( - std::forward(f), std::forward(t), + forward(f), forward(t), guts::make_index_sequence>::value>{})) { return detail::apply_impl( - std::forward(f), std::forward(t), + forward(f), forward(t), guts::make_index_sequence>::value>{}); } diff --git a/aten/src/ATen/core/DimVector.h b/aten/src/ATen/core/DimVector.h new file mode 100644 index 00000000000000..a98c841a94777b --- /dev/null +++ b/aten/src/ATen/core/DimVector.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace at { + +/// A container for sizes or strides +using DimVector = SmallVector; + +} // namespace at diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp similarity index 99% rename from aten/src/ATen/Formatting.cpp rename to aten/src/ATen/core/Formatting.cpp index 390230316bd0dc..f13b0082d90d10 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -1,6 +1,4 @@ -#include "ATen/Formatting.h" - -#include +#include "ATen/core/Formatting.h" #include #include diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h new file mode 100644 index 00000000000000..c6ac26b8a9e0e3 --- /dev/null +++ b/aten/src/ATen/core/Formatting.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + +AT_API std::ostream& operator<<(std::ostream & out, IntList list); +AT_API std::ostream& operator<<(std::ostream & out, Backend b); +AT_API std::ostream& operator<<(std::ostream & out, const 
Type & t); +AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); +static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { + return print(out,t,80); +} +static inline void print(const Tensor & t, int64_t linesize=80) { + print(std::cout,t,linesize); +} + +static inline std::ostream& operator<<(std::ostream & out, Scalar s) { + return out << (s.isFloatingPoint() ? s.toDouble() : s.toLong()); +} + +} diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h index a1786d0bb9db6e..75ff2a2fe6937f 100644 --- a/aten/src/ATen/core/Half-inl.h +++ b/aten/src/ATen/core/Half-inl.h @@ -16,7 +16,7 @@ namespace at { /// Constructors -inline AT_HOSTDEVICE Half::Half(float value) { +inline AT_HOST_DEVICE Half::Half(float value) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) x = __half_as_short(__float2half(value)); #else @@ -26,7 +26,7 @@ inline AT_HOSTDEVICE Half::Half(float value) { /// Implicit conversions -inline AT_HOSTDEVICE Half::operator float() const { +inline AT_HOST_DEVICE Half::operator float() const { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return __half2float(*reinterpret_cast(&x)); #else @@ -35,150 +35,158 @@ inline AT_HOSTDEVICE Half::operator float() const { } #ifdef __CUDACC__ -inline AT_HOSTDEVICE Half::Half(const __half& value) { +inline AT_HOST_DEVICE Half::Half(const __half& value) { x = *reinterpret_cast(&value); } -inline AT_HOSTDEVICE Half::operator __half() const { +inline AT_HOST_DEVICE Half::operator __half() const { return *reinterpret_cast(&x); } #endif +// CUDA intrinsics + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) +inline __device__ Half __ldg(const Half* ptr) { + return __ldg(reinterpret_cast(ptr)); +} +#endif + /// Arithmetic -inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator+(const Half& a, const Half& b) { return static_cast(a) + static_cast(b); } -inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator-(const Half& a, const Half& b) { return static_cast(a) - static_cast(b); } -inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator*(const Half& a, const Half& b) { return static_cast(a) * static_cast(b); } -inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator/(const Half& a, const Half& b) { return static_cast(a) / static_cast(b); } -inline AT_HOSTDEVICE Half operator-(const Half& a) { +inline AT_HOST_DEVICE Half operator-(const Half& a) { return -static_cast(a); } -inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator+=(Half& a, const Half& b) { a = a + b; return a; } -inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator-=(Half& a, const Half& b) { a = a - b; return a; } -inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator*=(Half& a, const Half& b) { a = a * b; return a; } -inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator/=(Half& a, const Half& b) { a = a / b; return a; } /// Arithmetic with floats -inline AT_HOSTDEVICE float operator+(Half a, float b) { +inline AT_HOST_DEVICE float operator+(Half a, float b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE float operator-(Half a, float b) { +inline AT_HOST_DEVICE float operator-(Half a, float 
b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE float operator*(Half a, float b) { +inline AT_HOST_DEVICE float operator*(Half a, float b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE float operator/(Half a, float b) { +inline AT_HOST_DEVICE float operator/(Half a, float b) { return static_cast(a) / b; } -inline AT_HOSTDEVICE float operator+(float a, Half b) { +inline AT_HOST_DEVICE float operator+(float a, Half b) { return a + static_cast(b); } -inline AT_HOSTDEVICE float operator-(float a, Half b) { +inline AT_HOST_DEVICE float operator-(float a, Half b) { return a - static_cast(b); } -inline AT_HOSTDEVICE float operator*(float a, Half b) { +inline AT_HOST_DEVICE float operator*(float a, Half b) { return a * static_cast(b); } -inline AT_HOSTDEVICE float operator/(float a, Half b) { +inline AT_HOST_DEVICE float operator/(float a, Half b) { return a / static_cast(b); } -inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator+=(float& a, const Half& b) { return a += static_cast(b); } -inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator-=(float& a, const Half& b) { return a -= static_cast(b); } -inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator*=(float& a, const Half& b) { return a *= static_cast(b); } -inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator/=(float& a, const Half& b) { return a /= static_cast(b); } /// Arithmetic with doubles -inline AT_HOSTDEVICE double operator+(Half a, double b) { +inline AT_HOST_DEVICE double operator+(Half a, double b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE double operator-(Half a, double b) { +inline AT_HOST_DEVICE double operator-(Half a, double b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE double operator*(Half a, double b) { +inline AT_HOST_DEVICE double operator*(Half a, double b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE double operator/(Half a, double b) { +inline AT_HOST_DEVICE double operator/(Half a, double b) { return static_cast(a) / b; } -inline AT_HOSTDEVICE double operator+(double a, Half b) { +inline AT_HOST_DEVICE double operator+(double a, Half b) { return a + static_cast(b); } -inline AT_HOSTDEVICE double operator-(double a, Half b) { +inline AT_HOST_DEVICE double operator-(double a, Half b) { return a - static_cast(b); } -inline AT_HOSTDEVICE double operator*(double a, Half b) { +inline AT_HOST_DEVICE double operator*(double a, Half b) { return a * static_cast(b); } -inline AT_HOSTDEVICE double operator/(double a, Half b) { +inline AT_HOST_DEVICE double operator/(double a, Half b) { return a / static_cast(b); } /// Arithmetic with ints -inline AT_HOSTDEVICE Half operator+(Half a, int b) { +inline AT_HOST_DEVICE Half operator+(Half a, int b) { return a + static_cast(b); } -inline AT_HOSTDEVICE Half operator-(Half a, int b) { +inline AT_HOST_DEVICE Half operator-(Half a, int b) { return a - static_cast(b); } -inline AT_HOSTDEVICE Half operator*(Half a, int b) { +inline AT_HOST_DEVICE Half operator*(Half a, int b) { return a * static_cast(b); } -inline AT_HOSTDEVICE Half operator/(Half a, int b) { +inline AT_HOST_DEVICE Half operator/(Half a, int b) { return a / static_cast(b); } -inline AT_HOSTDEVICE Half operator+(int a, Half b) { +inline AT_HOST_DEVICE Half operator+(int a, Half b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE Half operator-(int a, Half b) { +inline 
AT_HOST_DEVICE Half operator-(int a, Half b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE Half operator*(int a, Half b) { +inline AT_HOST_DEVICE Half operator*(int a, Half b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE Half operator/(int a, Half b) { +inline AT_HOST_DEVICE Half operator/(int a, Half b) { return static_cast(a) / b; } diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index c306fcd6b92b72..47a8e8e52d2adb 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -30,14 +30,6 @@ #include #endif -#ifndef AT_HOSTDEVICE -#ifdef __CUDACC__ -#define AT_HOSTDEVICE __host__ __device__ -#else -#define AT_HOSTDEVICE -#endif -#endif - namespace at { namespace detail { @@ -55,18 +47,18 @@ struct alignas(2) Half { // HIP wants __host__ __device__ tag, CUDA does not #ifdef __HIP_PLATFORM_HCC__ - AT_HOSTDEVICE Half() = default; + AT_HOST_DEVICE Half() = default; #else Half() = default; #endif - constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; - inline AT_HOSTDEVICE Half(float value); - inline AT_HOSTDEVICE operator float() const; + constexpr AT_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; + inline AT_HOST_DEVICE Half(float value); + inline AT_HOST_DEVICE operator float() const; #ifdef __CUDACC__ - inline AT_HOSTDEVICE Half(const __half& value); - inline AT_HOSTDEVICE operator __half() const; + inline AT_HOST_DEVICE Half(const __half& value); + inline AT_HOST_DEVICE operator __half() const; #endif }; @@ -186,17 +178,8 @@ To checked_convert(From f, const char* name) { return convert(f); } -template -To HalfFix(From h) { - To ret; - ret.x = h.x; - return ret; -} - AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); } // namespace at #include "ATen/core/Half-inl.h" - -#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 67efa523ac2bba..244124475bc08f 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -39,6 +39,17 @@ #define AT_CORE_API AT_CORE_IMPORT #endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define AT_HOST_DEVICE __host__ __device__ +#define AT_DEVICE __device__ +#define AT_HOST __host__ +#else +#define AT_HOST_DEVICE +#define AT_HOST +#define AT_DEVICE +#endif + // Disable the copy and assignment operator for a class. Note that this will // disable the usage of the class in std containers. 
#define AT_DISABLE_COPY_AND_ASSIGN(classname) \ diff --git a/aten/src/ATen/core/Scalar.h b/aten/src/ATen/core/Scalar.h index 35c4b538336aeb..de01a56ce33748 100644 --- a/aten/src/ATen/core/Scalar.h +++ b/aten/src/ATen/core/Scalar.h @@ -99,6 +99,6 @@ template<> \ inline T Scalar::to() { \ return to##name(); \ } -AT_FORALL_SCALAR_TYPES(DEFINE_TO) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO) #undef DEFINE_TO } diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h index b5e1a47646d7d6..6fe88bfadb05f5 100644 --- a/aten/src/ATen/core/ScalarType.h +++ b/aten/src/ATen/core/ScalarType.h @@ -80,6 +80,18 @@ static inline DataType scalarTypeToDataType(ScalarType scalar_type) { #undef DEFINE_CASE } +static inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) { +#define DEFINE_CASE(ctype,name,_) \ + case ScalarType:: name : return caffe2::TypeMeta::Make(); + + switch(scalar_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) + case ScalarType::Undefined: return caffe2::TypeMeta(); + default: AT_ERROR("Unrecognized Scalartype ", scalar_type, " (please report this error)"); + } +#undef DEFINE_CASE +} + static inline ScalarType dataTypeToScalarType(DataType dtype) { #define DEFINE_IF(ctype,name,_) \ if (dtype == caffe2::TypeMeta::Id()) { \ diff --git a/aten/src/ATen/core/Storage.cpp b/aten/src/ATen/core/Storage.cpp index 21f3e35ada4fd2..aca4bb75d2c95b 100644 --- a/aten/src/ATen/core/Storage.cpp +++ b/aten/src/ATen/core/Storage.cpp @@ -2,28 +2,4 @@ namespace at { -Storage::Storage( - at::ScalarType scalar_type, - size_t size, - Allocator* allocator, - bool resizable) - : storage_impl_(c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), - size, - allocator, - resizable)) {} - -Storage::Storage( - at::ScalarType scalar_type, - at::DataPtr data_ptr, - size_t size, - const std::function& deleter, - bool resizable) - : storage_impl_(c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), - size, - std::move(data_ptr), - /* allocator */ nullptr, - resizable)) {} - } // namespace at diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h index 6f2a8fd68ee716..ab201be88d630e 100644 --- a/aten/src/ATen/core/Storage.h +++ b/aten/src/ATen/core/Storage.h @@ -7,20 +7,58 @@ namespace at { struct AT_API Storage { public: Storage() {} - Storage(StorageImpl* storage_impl) : storage_impl_(c10::intrusive_ptr::reclaim(storage_impl)) {} - Storage(const c10::intrusive_ptr& ptr) : storage_impl_(ptr) {} - Storage(c10::intrusive_ptr&& ptr) : storage_impl_(std::move(ptr)) {} + Storage(c10::intrusive_ptr ptr) : storage_impl_(std::move(ptr)) {} Storage( - at::ScalarType, + caffe2::TypeMeta data_type, size_t size, Allocator* allocator, - bool resizable = false); + bool resizable = false) + : storage_impl_(c10::make_intrusive( + data_type, + size, + allocator, + resizable)) {} + Storage( - at::ScalarType, - at::DataPtr, + caffe2::TypeMeta data_type, + at::DataPtr data_ptr, size_t size, const std::function& deleter, - bool resizable = false); + bool resizable = false) + : storage_impl_(c10::make_intrusive( + data_type, + size, + std::move(data_ptr), + /* allocator */ nullptr, + resizable)) {} + + Storage(at::DeviceType device_type) + : storage_impl_(c10::make_intrusive(device_type)) {} + Storage(at::DeviceType device_type, caffe2::TypeMeta data_type) + : storage_impl_( + c10::make_intrusive(device_type, data_type)) {} + + Storage( + caffe2::TypeMeta data_type, + int64_t numel, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool 
resizable) + : storage_impl_(c10::make_intrusive( + data_type, + numel, + std::move(data_ptr), + allocator, + resizable)) {} + + void reset() { + storage_impl_->reset(); + } + + template + inline bool IsType() const { + return storage_impl_->IsType(); + } template T* data() const { return storage_impl_->data(); } @@ -28,32 +66,122 @@ struct AT_API Storage { template T* unsafe_data() const { return storage_impl_->unsafe_data(); } - size_t elementSize() const { return storage_impl_->itemsize(); } - ptrdiff_t size() const { return storage_impl_->numel(); } - bool resizable() const { return storage_impl_->resizable(); } + size_t elementSize() const { + return storage_impl_->itemsize(); + } + + inline size_t itemsize() const { + return storage_impl_->itemsize(); + } + + ptrdiff_t size() const { + return storage_impl_->numel(); + } + + int64_t numel() const { + return storage_impl_->numel(); + } + + // TODO: remove later + void set_numel(int64_t numel) { + storage_impl_->set_numel(numel); + } + + bool resizable() const { + return storage_impl_->resizable(); + } + + size_t capacity() const { + return storage_impl_->capacity(); + } // get() use here is to get const-correctness - void* data() const { return storage_impl_.get()->data(); } - const at::DataType dtype() const { + + void* data() { + return storage_impl_->data(); + } + + void* data() const { + return storage_impl_.get()->data(); + } + + const caffe2::TypeMeta& dtype() const { return storage_impl_->dtype(); } - const at::DataPtr& data_ptr() const { return storage_impl_->data_ptr(); } - DeviceType device_type() const { return storage_impl_->device_type(); } - at::Allocator* allocator() const { return storage_impl_.get()->allocator(); } - at::Device device() const { return storage_impl_->device(); } + + at::DataPtr& data_ptr() { + return storage_impl_->data_ptr(); + } + + const at::DataPtr& data_ptr() const { + return storage_impl_->data_ptr(); + } + + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + return storage_impl_->set_data_ptr(std::move(data_ptr)); + }; + + void set_dtype(const caffe2::TypeMeta& data_type) { + storage_impl_->set_dtype(data_type); + } + + DeviceType device_type() const { + return storage_impl_->device_type(); + } + + at::Allocator* allocator() const { + return storage_impl_.get()->allocator(); + } + + at::Device device() const { + return storage_impl_->device(); + } StorageImpl* unsafeReleaseStorageImpl() { return storage_impl_.release(); } + StorageImpl* unsafeGetStorageImpl() const noexcept { return storage_impl_.get(); } + operator bool() const { return storage_impl_; } + size_t use_count() const { return storage_impl_.use_count(); } + inline bool unique() const { + return storage_impl_.unique(); + } + + void UniqueStorageShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity, + DeleterFnPtr d = nullptr) { + if (!storage_impl_.unique()) { + AT_ERROR( + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + src, data_type, capacity, d); + } + + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + if (!storage_impl_.unique()) { + AT_ERROR( + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + } + protected: c10::intrusive_ptr storage_impl_; }; diff --git 
a/aten/src/ATen/core/StorageImpl.cpp b/aten/src/ATen/core/StorageImpl.cpp index b0f82132e39cb8..5190a7766dcb49 100644 --- a/aten/src/ATen/core/StorageImpl.cpp +++ b/aten/src/ATen/core/StorageImpl.cpp @@ -1,30 +1 @@ #include - -namespace at { - -StorageImpl::StorageImpl( - at::DataType data_type, - int64_t numel, - at::DataPtr data_ptr, - at::Allocator* allocator, - bool resizable) - : data_type_(data_type), - data_ptr_(std::move(data_ptr)), - numel_(numel), - resizable_(resizable), - allocator_(allocator) {} - -StorageImpl::StorageImpl( - at::DataType data_type, - int64_t numel, - at::Allocator* allocator, - bool resizable) - : StorageImpl( - data_type, - numel, - allocator->allocate( - at::elementSize(dataTypeToScalarType(data_type)) * numel), - allocator, - resizable) {} - -} // namespace at diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index e80c11c6b0e211..cc63bd00906669 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -12,32 +12,73 @@ struct Type; struct AT_API StorageImpl : public c10::intrusive_ptr_target { public: - StorageImpl() = delete; - ~StorageImpl() {}; StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, - bool resizable); + bool resizable) + : data_type_(data_type), + data_ptr_(std::move(data_ptr)), + numel_(numel), + resizable_(resizable), + allocator_(allocator) { + if (numel > 0) { + if (data_type_.id() == caffe2::TypeIdentifier::uninitialized()) { + AT_ERROR( + "Constructing a storage with meta of unknown type and non-zero numel"); + } + } + } + StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::Allocator* allocator, - bool resizable); + bool resizable) + : StorageImpl( + data_type, + numel, + allocator->allocate(data_type.itemsize() * numel), + allocator, + resizable) {} + + explicit StorageImpl(at::DeviceType device_type) + : StorageImpl(device_type, caffe2::TypeMeta()) {} + + StorageImpl(at::DeviceType device_type, caffe2::TypeMeta data_type) + : StorageImpl( + data_type, + 0, + at::DataPtr(nullptr, at::Device(device_type)), + nullptr, + true) {} + + StorageImpl& operator=(StorageImpl&& other) = default; + StorageImpl& operator=(const StorageImpl&) = delete; + StorageImpl() = delete; + StorageImpl(StorageImpl&& other) = default; StorageImpl(StorageImpl&) = delete; StorageImpl(const StorageImpl&) = delete; - // NB: Don't move ref count! 
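// Illustrative sketch (not part of the diff above): constructing a Storage via
// the new caffe2::TypeMeta-based constructors. The helper name is hypothetical
// and the allocator is assumed to be supplied by the caller; TypeMeta carries
// the itemsize, so numel * itemsize bytes are allocated.
#include "ATen/core/Storage.h"

static at::Storage make_float_storage(at::Allocator* allocator, int64_t numel) {
  return at::Storage(caffe2::TypeMeta::Make<float>(), numel, allocator,
                     /*resizable=*/true);
}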
- StorageImpl(StorageImpl&& other) = default; - StorageImpl& operator=(StorageImpl&& other) = default; + ~StorageImpl() = default; + + void reset() { + data_ptr_.clear(); + numel_ = 0; + } + + template + inline bool IsType() const { + return data_type_.Match(); + } template inline T* data() const { auto data_type_T = at::scalarTypeToDataType(at::CTypeToScalarType::to()); - if (dtype() != data_type_T) { + if (dtype().id() != data_type_T) { AT_ERROR( "Attempt to access StorageImpl having data type ", - dtype(), + dtype().id(), " as data type ", data_type_T); } @@ -53,52 +94,77 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { data_ptr_.clear(); } - void operator=(const StorageImpl&) = delete; - size_t itemsize() const { - return at::elementSize(dataTypeToScalarType(data_type_)); + return data_type_.itemsize(); } Type& type(); + size_t capacity() const { + return numel_ * itemsize(); + } + int64_t numel() const { return numel_; }; + + // TODO: remove later void set_numel(int64_t numel) { numel_ = numel; }; + bool resizable() const { return resizable_; }; + at::DataPtr& data_ptr() { return data_ptr_; }; + const at::DataPtr& data_ptr() const { return data_ptr_; }; + // Returns the previous data_ptr at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { std::swap(data_ptr_, data_ptr); return std::move(data_ptr); }; + + // XXX: TERRIBLE! DONT USE UNLESS YOU HAVE TO! AND EVEN THEN DONT, JUST DONT! + // Setting the data_type will require you to audit many other parts of the + // struct again to make sure it's still valid. + void set_dtype(const caffe2::TypeMeta& data_type) { + int64_t capacity = numel_ * data_type_.itemsize(); + data_type_ = data_type; + numel_ = capacity / data_type_.itemsize(); + } + + // TODO: Return const ptr eventually if possible void* data() { return data_ptr_.get(); - }; - const void* data() const { + } + + void* data() const { return data_ptr_.get(); - }; + } + at::DeviceType device_type() const { return data_ptr_.device().type(); } + at::Allocator* allocator() { return allocator_; - }; - const DataType dtype() const { + } + + const caffe2::TypeMeta& dtype() const { return data_type_; } + const at::Allocator* allocator() const { return allocator_; }; + // You generally shouldn't use this method, but it is occasionally // useful if you want to override how a tensor will be reallocated, // after it was already allocated (and its initial allocator was @@ -106,15 +172,53 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { void set_allocator(at::Allocator* allocator) { allocator_ = allocator; } + Device device() const { return data_ptr_.device(); } + void set_resizable(bool resizable) { resizable_ = resizable; } + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity, + DeleterFnPtr d = nullptr) { + UniqueStorageShareExternalPointer( + at::DataPtr(src, src, d, data_ptr_.device()), data_type, capacity); + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + data_type_ = data_type; + // TODO: Use CAFFE_ENFORCE_WITH_CALLER equivalent + // For now causes lots of redefine issues if caffe2/core/logging.h is used + if (data_type_.id() == caffe2::TypeIdentifier::uninitialized()) { + AT_ERROR( + "To share with a raw external pointer you need to have meta " + "already set."); + } + data_ptr_ = std::move(data_ptr); + // NOTE: 
data_type might change and so it's also possible that capacity + // might not be divisible by itemsize. There is no way for us to keep track + // of the exact capacity if we're not explicitly storing it. More concretely, + // capacity() might not return the value that was set here, if itemsize does + // not evenly divide it. + numel_ = capacity / data_type_.itemsize(); + } + private: - at::DataType data_type_; + caffe2::TypeMeta data_type_; at::DataPtr data_ptr_; int64_t numel_; bool resizable_; diff --git a/aten/src/ATen/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp similarity index 77% rename from aten/src/ATen/Tensor.cpp rename to aten/src/ATen/core/Tensor.cpp index 860a5d2ab0afe1..924688d40b9551 100644 --- a/aten/src/ATen/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -1,6 +1,6 @@ -#include -#include -#include +#include +#include +#include #include diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h new file mode 100644 index 00000000000000..1b2c0f0e288264 --- /dev/null +++ b/aten/src/ATen/core/Tensor.h @@ -0,0 +1,688 @@ +#pragma once + +#include "ATen/core/Device.h" +#include "ATen/core/Layout.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Storage.h" +#include "ATen/core/TensorAccessor.h" +#include "ATen/core/TensorImpl.h" +#include "ATen/core/optional.h" +#include "ATen/core/UndefinedTensorImpl.h" +#include "ATen/core/Error.h" + +namespace at { +struct Generator; +struct Type; +struct Tensor; +struct TensorOptions; +} // namespace at + +namespace at { +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and +// special care must be taken to handle this. +struct AT_API Tensor { + Tensor(){}; + Tensor(c10::intrusive_ptr tensor_impl) + : tensor_impl_(std::move(tensor_impl)) { + if (tensor_impl_.get() == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not supported"); + } + } + + Tensor(const Tensor&) = default; + Tensor(Tensor&&) = default; + + int64_t dim() const { + return tensor_impl_->dim(); + } + + TensorImpl * unsafeGetTensorImpl() const { + return tensor_impl_.get(); + } + TensorImpl * unsafeReleaseTensorImpl() { + return tensor_impl_.release(); + } + const c10::intrusive_ptr& getIntrusivePtr() const { + return tensor_impl_; + } + + bool defined() const { + return tensor_impl_; + } + + void reset() { + tensor_impl_.reset(); + } + + // The following overloads are very intriguing. Consider the following + // program: + // + // x[1] = 3; + // + // We would expect that the first entry of x is written to 3. But how can we + // actually achieve this? x[1] evaluates to a tensor... + // + // The answer is, using a ref-qualifier.
x[1] is an rvalue, which cannot be + // (profitably) assigned to in the traditional sense, so we overload + // assignment to mean, "Actually, copy 3 into the tensor data." This is done + // with an rvalue-reference ref-qualified overload (the methods with && at the + // end of their type.) + // + // There's one more fly in the ointment: We also want + // + // Tensor x = y; + // + // to work, and we want it NOT to copy. So we need a traditional operator= + // overload. But we MUST specify a mutable lvalue ref-qualifier, to + // disambiguate the traditional overload from the rvalue-reference + // ref-qualified overload. Otherwise, it will be ambiguous, because + // a non ref-qualified method is eligible for all situations. + + // Unfortunately, we have to write these constructors out manually + // to work around an MSVC bug: + // error C2580: 'at::Tensor &at::Tensor::operator =(const at::Tensor &) &': + // multiple versions of a defaulted special member functions are not allowed + // Tensor& operator=(const Tensor&) & = default; + // Tensor& operator=(Tensor&&) & = default; + Tensor& operator=(const Tensor& x) & { + tensor_impl_ = x.tensor_impl_; + return *this; + } + Tensor& operator=(Tensor&& x) & { + tensor_impl_ = std::move(x.tensor_impl_); + return *this; + } + + Tensor& operator=(Scalar v) &&; + Tensor& operator=(const Tensor&) &&; + Tensor& operator=(Tensor&&) &&; + + bool is_same(const Tensor& other) const noexcept { + return tensor_impl_ == other.tensor_impl_; + } + size_t use_count() const noexcept { + return tensor_impl_.use_count(); + } + size_t weak_use_count() const noexcept { + return tensor_impl_.weak_use_count(); + } + + const char * toString() const; + + IntList sizes() const { + return tensor_impl_->sizes(); + } + IntList strides() const { + return tensor_impl_->strides(); + } + int64_t ndimension() const { + return dim(); + } + Type & type() const { + return tensor_impl_->type(); + } + TensorTypeId type_id() const { + return tensor_impl_->type_id(); + } + ScalarType scalar_type() const { + return dataTypeToScalarType(tensor_impl_->dtype().id()); + } + const Storage& storage() const { + return tensor_impl_->storage(); + } + Tensor toType(const Type & t, bool non_blocking=false) const; + Tensor & copy_(const Tensor & src, bool non_blocking=false); + Tensor toType(ScalarType t) const; + Tensor toBackend(Backend b) const; + + /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. + /// Defined in Type.h because of include order issues. + bool is_variable() const noexcept; + + /// Returns a `Tensor`'s layout. Defined in Type.h + Layout layout() const noexcept; + + /// Returns a `Tensor`'s dtype (`ScalarType`). Defined in Type.h + ScalarType dtype() const noexcept; + + /// Returns a `Tensor`'s device. + Device device() const; + + /// Returns the `TensorOptions` corresponding to this `Tensor`. Defined in + /// TensorOptions.h. + TensorOptions options() const; + + template + T * data() const; + + // Purposely not defined here to avoid inlining + void print() const; + + //toLongData(), toFloatData() etc. + #define TO_TYPE_DATA(T,name,_) \ + T * to##name##Data() const; + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) + #undef TO_TYPE_DATA + + #define TO_C_TYPE(T,name,_) \ + T toC##name () const; + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) + #undef TO_C_TYPE + + // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and + // dimension. 
+ template + TensorAccessor accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return TensorAccessor(data(),sizes().data(),strides().data()); + } + template + TensorAccessor accessor() && = delete; + + // Return a `PackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and + // dimension. You can optionally specify RestrictPtrTraits as a template parameter to + // cast the data pointer to a __restrict__ pointer. + // In order to use this, your CUDA kernel has to take a corresponding PackedTensorAccessor + // as an argument. + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); + } + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() && = delete; + + Tensor operator-() const; + Tensor& operator+=(const Tensor & other); + Tensor& operator+=(Scalar other); + Tensor& operator-=(const Tensor & other); + Tensor& operator-=(Scalar other); + Tensor& operator*=(const Tensor & other); + Tensor& operator*=(Scalar other); + Tensor& operator/=(const Tensor & other); + Tensor& operator/=(Scalar other); + Tensor operator[](Scalar index) const; + Tensor operator[](Tensor index) const; + Tensor operator[](int64_t index) const; + + Tensor cpu() const; + Tensor cuda() const; + + // ~~~~~ Autograd API ~~~~~ + + Tensor& set_requires_grad(bool requires_grad) { + tensor_impl_->set_requires_grad(requires_grad); + return *this; + } + bool requires_grad() const { + return tensor_impl_->requires_grad(); + } + + Tensor& grad() { + return tensor_impl_->grad(); + } + const Tensor& grad() const { + return tensor_impl_->grad(); + } + + void set_data(Tensor new_data); + + /// Computes the gradient of current tensor w.r.t. graph leaves. + void backward( + at::optional gradient = at::nullopt, + bool keep_graph = false, + bool create_graph = false); + + // STOP. Thinking of adding a method here, which only makes use + // of other ATen methods? Define it in native_functions.yaml. 
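// Illustrative sketch (not part of the diff above; assumes the usual ATen
// factory functions such as at::ones are available): the reference-count and
// rvalue-qualified assignment semantics described in the comments above.
#include "ATen/ATen.h"

static void refcount_demo() {
  at::Tensor a = at::ones({3});
  at::Tensor b = a;  // b shares a's TensorImpl; a.use_count() is now 2
  a[1] = 3;          // a[1] is an rvalue Tensor, so operator=(Scalar) &&
                     // writes 3 into the underlying data instead of rebinding
  b.reset();         // drops b's reference; a.use_count() is back to 1
}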
+ + //example + //Tensor * add(Tensor & b); + int64_t storage_offset() const; + Tensor & resize_(IntList size); + Tensor & set_(Storage source); + Tensor & set_(Storage source, int64_t storage_offset, IntList size, IntList stride={}); + Tensor & set_(const Tensor & source); + Tensor & set_(); + bool is_contiguous() const; + bool is_set_to(const Tensor & tensor) const; + Tensor & masked_fill_(const Tensor & mask, Scalar value); + Tensor & masked_fill_(const Tensor & mask, const Tensor & value); + Tensor & masked_scatter_(const Tensor & mask, const Tensor & source); + Tensor masked_select(const Tensor & mask) const; + Tensor nonzero() const; + Tensor contiguous() const; + Tensor view(IntList size) const; + Tensor index_select(int64_t dim, const Tensor & index) const; + Tensor take(const Tensor & index) const; + Tensor & put_(const Tensor & index, const Tensor & source, bool accumulate=false); + Tensor & index_add_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor & index_fill_(int64_t dim, const Tensor & index, Scalar value); + Tensor & index_fill_(int64_t dim, const Tensor & index, const Tensor & value); + Tensor unfold(int64_t dimension, int64_t size, int64_t step) const; + Tensor & scatter_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor & scatter_(int64_t dim, const Tensor & index, Scalar value); + Tensor & scatter_add_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor gather(int64_t dim, const Tensor & index) const; + void* data_ptr() const; + bool equal(const Tensor & other) const; + Tensor __and__(Scalar other) const; + Tensor __and__(const Tensor & other) const; + Tensor & __iand__(Scalar other); + Tensor & __iand__(const Tensor & other); + Tensor __or__(Scalar other) const; + Tensor __or__(const Tensor & other) const; + Tensor & __ior__(Scalar other); + Tensor & __ior__(const Tensor & other); + Tensor __xor__(Scalar other) const; + Tensor __xor__(const Tensor & other) const; + Tensor & __ixor__(Scalar other); + Tensor & __ixor__(const Tensor & other); + Tensor __lshift__(Scalar other) const; + Tensor __lshift__(const Tensor & other) const; + Tensor & __ilshift__(Scalar other); + Tensor & __ilshift__(const Tensor & other); + Tensor __rshift__(Scalar other) const; + Tensor __rshift__(const Tensor & other) const; + Tensor & __irshift__(Scalar other); + Tensor & __irshift__(const Tensor & other); + Tensor lt(Scalar other) const; + Tensor lt(const Tensor & other) const; + Tensor & lt_(Scalar other); + Tensor & lt_(const Tensor & other); + Tensor gt(Scalar other) const; + Tensor gt(const Tensor & other) const; + Tensor & gt_(Scalar other); + Tensor & gt_(const Tensor & other); + Tensor le(Scalar other) const; + Tensor le(const Tensor & other) const; + Tensor & le_(Scalar other); + Tensor & le_(const Tensor & other); + Tensor ge(Scalar other) const; + Tensor ge(const Tensor & other) const; + Tensor & ge_(Scalar other); + Tensor & ge_(const Tensor & other); + Tensor eq(Scalar other) const; + Tensor eq(const Tensor & other) const; + Tensor & eq_(Scalar other); + Tensor & eq_(const Tensor & other); + Tensor ne(Scalar other) const; + Tensor ne(const Tensor & other) const; + Tensor & ne_(Scalar other); + Tensor & ne_(const Tensor & other); + Tensor min(const Tensor & other) const; + Tensor min() const; + Tensor max(const Tensor & other) const; + Tensor max() const; + Tensor median() const; + std::tuple sort(int64_t dim=-1, bool descending=false) const; + std::tuple topk(int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + 
Tensor all() const; + Tensor any() const; + Tensor lgamma() const; + Tensor & lgamma_(); + Tensor digamma() const; + Tensor & digamma_(); + Tensor polygamma(int64_t n) const; + Tensor & polygamma_(int64_t n); + Tensor & erfinv_(); + Tensor erfinv() const; + Tensor & frac_(); + Tensor frac() const; + Tensor renorm(Scalar p, int64_t dim, Scalar maxnorm) const; + Tensor & renorm_(Scalar p, int64_t dim, Scalar maxnorm); + Tensor dist(const Tensor & other, Scalar p=2) const; + Tensor reciprocal() const; + Tensor & reciprocal_(); + Tensor neg() const; + Tensor & neg_(); + Tensor atan2(const Tensor & other) const; + Tensor & atan2_(const Tensor & other); + Tensor pow(const Tensor & exponent) const; + Tensor & pow_(Scalar exponent); + Tensor & pow_(const Tensor & exponent); + Tensor lerp(const Tensor & end, Scalar weight) const; + Tensor & lerp_(const Tensor & end, Scalar weight); + Tensor histc(int64_t bins=100, Scalar min=0, Scalar max=0) const; + Tensor sign() const; + Tensor & sign_(); + Tensor trace() const; + Tensor fmod(Scalar other) const; + Tensor fmod(const Tensor & other) const; + Tensor & fmod_(Scalar other); + Tensor & fmod_(const Tensor & other); + Tensor remainder(Scalar other) const; + Tensor remainder(const Tensor & other) const; + Tensor & remainder_(Scalar other); + Tensor & remainder_(const Tensor & other); + Tensor tril(int64_t diagonal=0) const; + Tensor & tril_(int64_t diagonal=0); + Tensor triu(int64_t diagonal=0) const; + Tensor & triu_(int64_t diagonal=0); + Tensor cross(const Tensor & other, int64_t dim=-1) const; + Tensor diag(int64_t diagonal=0) const; + Tensor addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + Tensor addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + std::tuple gels(const Tensor & A) const; + std::tuple trtrs(const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + std::tuple symeig(bool eigenvectors=false, bool upper=true) const; + std::tuple eig(bool eigenvectors=false) const; + std::tuple svd(bool some=true) const; + Tensor potrf(bool upper=true) const; + Tensor potrs(const Tensor & input2, bool upper=true) const; + Tensor potri(bool upper=true) const; + std::tuple pstrf(bool upper=true, Scalar tol=-1) const; + std::tuple qr() const; + std::tuple geqrf() const; + Tensor orgqr(const Tensor & input2) const; + Tensor ormqr(const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + std::tuple btrifact(bool pivot=true) const; + std::tuple btrifact_with_info(bool pivot=true) const; + Tensor btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const; + Tensor & random_(int64_t from, int64_t to, Generator * generator=nullptr); + Tensor & random_(int64_t to, Generator * generator=nullptr); + Tensor & random_(Generator * generator=nullptr); + Tensor multinomial(int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + Tensor & uniform_(double from=0, double to=1, Generator * generator=nullptr); + Tensor & normal_(double mean=0, double std=1, Generator * generator=nullptr); + Tensor & cauchy_(double median=0, double sigma=1, Generator * 
generator=nullptr); + Tensor & log_normal_(double mean=1, double std=2, Generator * generator=nullptr); + Tensor & exponential_(double lambd=1, Generator * generator=nullptr); + Tensor & geometric_(double p, Generator * generator=nullptr); + Tensor abs() const; + Tensor & abs_(); + Tensor acos() const; + Tensor & acos_(); + Tensor add(const Tensor & other, Scalar alpha=1) const; + Tensor & add_(const Tensor & other, Scalar alpha=1); + Tensor add(Scalar other, Scalar alpha=1) const; + Tensor & add_(Scalar other, Scalar alpha=1); + Tensor addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); + Tensor addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); + Tensor all(int64_t dim, bool keepdim=false) const; + bool allclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const; + Tensor any(int64_t dim, bool keepdim=false) const; + Tensor argmax(int64_t dim, bool keepdim=false) const; + Tensor argmax() const; + Tensor argmin(int64_t dim, bool keepdim=false) const; + Tensor argmin() const; + Tensor as_strided(IntList size, IntList stride) const; + Tensor & as_strided_(IntList size, IntList stride); + Tensor as_strided(IntList size, IntList stride, int64_t storage_offset) const; + Tensor & as_strided_(IntList size, IntList stride, int64_t storage_offset); + Tensor asin() const; + Tensor & asin_(); + Tensor atan() const; + Tensor & atan_(); + Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor bernoulli(const Tensor & p, Generator * generator=nullptr) const; + Tensor bernoulli(double p, Generator * generator=nullptr) const; + Tensor bernoulli() const; + Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr); + Tensor & bernoulli_(double p, Generator * generator=nullptr); + Tensor & bernoulli_(); + Tensor bincount(const Tensor & weights={}, int64_t minlength=0) const; + Tensor bmm(const Tensor & mat2) const; + Tensor ceil() const; + Tensor & ceil_(); + std::vector chunk(int64_t chunks, int64_t dim=0) const; + Tensor clamp(Scalar min, Scalar max) const; + Tensor & clamp_(Scalar min, Scalar max); + Tensor clamp_max(Scalar max) const; + Tensor & clamp_max_(Scalar max); + Tensor clamp_min(Scalar min) const; + Tensor & clamp_min_(Scalar min); + Tensor cos() const; + Tensor & cos_(); + Tensor cosh() const; + Tensor & cosh_(); + Tensor cumsum(int64_t dim, ScalarType dtype) const; + Tensor cumsum(int64_t dim) const; + Tensor cumprod(int64_t dim, ScalarType dtype) const; + Tensor cumprod(int64_t dim) const; + Tensor det() const; + Tensor diagflat(int64_t offset=0) const; + Tensor diagonal(int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const; + Tensor div(const Tensor & other) const; + Tensor & div_(const Tensor & other); + Tensor div(Scalar other) const; + Tensor & div_(Scalar other); + Tensor dot(const Tensor & tensor) const; + Tensor erf() const; + Tensor & erf_(); + Tensor erfc() const; + Tensor & erfc_(); + Tensor exp() const; + Tensor & exp_(); + Tensor expm1() const; + Tensor & expm1_(); + Tensor expand(IntList size, bool implicit=false) const; + Tensor expand_as(const Tensor & other) const; + Tensor flatten(int64_t start_dim=0, int64_t end_dim=-1) const; + 
Tensor & fill_(Scalar value); + Tensor & fill_(const Tensor & value); + Tensor floor() const; + Tensor & floor_(); + Tensor ger(const Tensor & vec2) const; + std::tuple gesv(const Tensor & A) const; + Tensor fft(int64_t signal_ndim, bool normalized=false) const; + Tensor ifft(int64_t signal_ndim, bool normalized=false) const; + Tensor rfft(int64_t signal_ndim, bool normalized=false, bool onesided=true) const; + Tensor irfft(int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const; + Tensor index(TensorList indices) const; + Tensor & index_copy_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor index_put(TensorList indices, const Tensor & values) const; + Tensor & index_put_(TensorList indices, const Tensor & values); + Tensor inverse() const; + Tensor isclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const; + bool is_cuda() const; + bool is_distributed() const; + bool is_floating_point() const; + bool is_complex() const; + bool is_nonzero() const; + bool is_same_size(const Tensor & other) const; + bool is_signed() const; + bool is_sparse() const; + std::tuple kthvalue(int64_t k, int64_t dim=-1, bool keepdim=false) const; + Tensor log() const; + Tensor & log_(); + Tensor log10() const; + Tensor & log10_(); + Tensor log1p() const; + Tensor & log1p_(); + Tensor log2() const; + Tensor & log2_(); + Tensor logdet() const; + Tensor log_softmax(int64_t dim) const; + Tensor logsumexp(int64_t dim, bool keepdim=false) const; + Tensor matmul(const Tensor & other) const; + Tensor matrix_power(int64_t n) const; + std::tuple max(int64_t dim, bool keepdim=false) const; + Tensor max_values(int64_t dim, bool keepdim=false) const; + Tensor mean(ScalarType dtype) const; + Tensor mean() const; + Tensor mean(int64_t dim, bool keepdim, ScalarType dtype) const; + Tensor mean(int64_t dim, bool keepdim=false) const; + Tensor mean(int64_t dim, ScalarType dtype) const; + std::tuple median(int64_t dim, bool keepdim=false) const; + std::tuple min(int64_t dim, bool keepdim=false) const; + Tensor min_values(int64_t dim, bool keepdim=false) const; + Tensor mm(const Tensor & mat2) const; + std::tuple mode(int64_t dim=-1, bool keepdim=false) const; + Tensor mul(const Tensor & other) const; + Tensor & mul_(const Tensor & other); + Tensor mul(Scalar other) const; + Tensor & mul_(Scalar other); + Tensor mv(const Tensor & vec) const; + Tensor mvlgamma(int64_t p) const; + Tensor & mvlgamma_(int64_t p); + Tensor narrow(int64_t dim, int64_t start, int64_t length) const; + Tensor permute(IntList dims) const; + Tensor pin_memory() const; + Tensor pinverse(double rcond=1e-15) const; + Tensor repeat(IntList repeats) const; + Tensor reshape(IntList shape) const; + Tensor reshape_as(const Tensor & other) const; + Tensor round() const; + Tensor & round_(); + Tensor relu() const; + Tensor & relu_(); + Tensor hardshrink(Scalar lambd=0.5) const; + Tensor hardshrink_backward(const Tensor & grad_out, Scalar lambd) const; + Tensor rsqrt() const; + Tensor & rsqrt_(); + Tensor select(int64_t dim, int64_t index) const; + Tensor sigmoid() const; + Tensor & sigmoid_(); + Tensor sin() const; + Tensor & sin_(); + Tensor sinh() const; + Tensor & sinh_(); + Tensor detach() const; + Tensor & detach_(); + int64_t size(int64_t dim) const; + Tensor slice(int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const; + std::tuple slogdet() const; + Tensor smm(const Tensor & mat2) const; + Tensor softmax(int64_t dim) const; + std::vector 
split(int64_t split_size, int64_t dim=0) const; + std::vector split_with_sizes(IntList split_sizes, int64_t dim=0) const; + Tensor squeeze() const; + Tensor squeeze(int64_t dim) const; + Tensor & squeeze_(); + Tensor & squeeze_(int64_t dim); + Tensor sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const; + int64_t stride(int64_t dim) const; + Tensor sum(ScalarType dtype) const; + Tensor sum() const; + Tensor sum(IntList dim, bool keepdim, ScalarType dtype) const; + Tensor sum(IntList dim, bool keepdim=false) const; + Tensor sum(IntList dim, ScalarType dtype) const; + Tensor sqrt() const; + Tensor & sqrt_(); + Tensor std(bool unbiased=true) const; + Tensor std(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor prod(ScalarType dtype) const; + Tensor prod() const; + Tensor prod(int64_t dim, bool keepdim, ScalarType dtype) const; + Tensor prod(int64_t dim, bool keepdim=false) const; + Tensor prod(int64_t dim, ScalarType dtype) const; + Tensor t() const; + Tensor & t_(); + Tensor tan() const; + Tensor & tan_(); + Tensor tanh() const; + Tensor & tanh_(); + Tensor transpose(int64_t dim0, int64_t dim1) const; + Tensor & transpose_(int64_t dim0, int64_t dim1); + Tensor flip(IntList dims) const; + Tensor rot90(int64_t k=1, IntList dims={0,1}) const; + Tensor trunc() const; + Tensor & trunc_(); + Tensor type_as(const Tensor & other) const; + Tensor unsqueeze(int64_t dim) const; + Tensor & unsqueeze_(int64_t dim); + Tensor var(bool unbiased=true) const; + Tensor var(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor view_as(const Tensor & other) const; + Tensor where(const Tensor & condition, const Tensor & other) const; + Tensor norm(Scalar p=2) const; + Tensor norm(Scalar p, int64_t dim, bool keepdim=false) const; + Tensor clone() const; + Tensor & resize_as_(const Tensor & the_template); + Tensor pow(Scalar exponent) const; + Tensor & zero_(); + Tensor sub(const Tensor & other, Scalar alpha=1) const; + Tensor & sub_(const Tensor & other, Scalar alpha=1); + Tensor sub(Scalar other, Scalar alpha=1) const; + Tensor & sub_(Scalar other, Scalar alpha=1); + Tensor addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); + Tensor & sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor & sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor sparse_mask(SparseTensorRef mask) const; + Tensor to_dense() const; + int64_t _sparseDims() const; + int64_t _denseDims() const; + int64_t _nnz() const; + Tensor coalesce() const; + bool is_coalesced() const; + Tensor _indices() const; + Tensor _values() const; + int64_t numel() const; + std::vector unbind(int64_t dim=0) const; + int64_t get_device() const; + Tensor to(Device device, ScalarType dtype, bool non_blocking=false) const; + Tensor to(ScalarType dtype, bool non_blocking=false) const; + Tensor to(Device device, bool non_blocking=false) const; + Tensor to(const Tensor & other, bool non_blocking=false) const; + Scalar _local_scalar() const; + + template + auto m(F func, Args&&... 
params) const -> decltype(func(*this, std::forward(params)...)) { + return func(*this, std::forward(params)...); + } + + friend struct WeakTensor; + +protected: + c10::intrusive_ptr tensor_impl_; +}; + +struct AT_API WeakTensor { + WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} + + // XXX: this can return undefined tensors + // Ideally it would be at::optional, but MSVC is too cool for that + Tensor lock() const { + return Tensor(weak_tensor_impl_.lock()); + } + + bool is_same(const WeakTensor& other) const noexcept { + return weak_tensor_impl_ == other.weak_tensor_impl_; + } + + size_t use_count() const noexcept { + return weak_tensor_impl_.use_count(); + } + size_t weak_use_count() const noexcept { + return weak_tensor_impl_.weak_use_count(); + } + + TensorImpl* unsafeGetTensorImpl() const { + return weak_tensor_impl_._unsafe_get_target(); + } + +private: + c10::weak_intrusive_ptr weak_tensor_impl_; +}; +} // namespace at + +#include "ATen/core/TensorMethods.h" diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index e3a73a3cbea2e5..d8a851d998332a 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -2,46 +2,145 @@ #include #include +#include namespace at { +// The PtrTraits argument to the TensorAccessor/PackedTensorAccessor +// is used to enable the __restrict__ keyword/modifier for the data +// passed to cuda. +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +#ifdef __CUDACC__ +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; +#endif + +// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. +// For CUDA tensors it is used in device code (only). This means that we restrict ourselves +// to functions and types available there (e.g. IntList isn't). -template +// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers. +template class PtrTraits = DefaultPtrTraits> class TensorAccessorBase { public: - TensorAccessorBase(T * data_, const int64_t * sizes_, const int64_t * strides_) + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST_DEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : data_(data_), sizes_(sizes_), strides_(strides_) {} - IntList sizes() { + AT_HOST IntList sizes() const { return IntList(sizes_,N); } - IntList strides() { + AT_HOST IntList strides() const { return IntList(strides_,N); } - int64_t stride(int64_t i) { return strides()[i]; } - int64_t size(int64_t i) { return sizes()[i]; } + AT_HOST_DEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOST_DEVICE int64_t size(int64_t i) const { return sizes_[i]; } + AT_HOST_DEVICE T *data() { return data_; } + AT_HOST_DEVICE const T *data() const { return data_; } protected: - T * data_; + PtrType data_; const int64_t* sizes_; const int64_t* strides_; }; -template -class TensorAccessor : public TensorAccessorBase { +// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using +// `Tensor.accessor()`. +// For CUDA `Tensor`s, `PackedTensorAccessor` is used on the host and only +// indexing on the device uses `TensorAccessor`s. 
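A minimal host-side usage sketch (not part of this patch; the helper name `add_one` is illustrative and assumes `t` is a 2-D float CPU tensor): `Tensor::accessor<T, N>()` builds a `TensorAccessor`, after which indexing is plain pointer arithmetic over the stored sizes and strides.

#include <ATen/ATen.h>

// Adds 1 to every element of a 2-D float CPU tensor through a TensorAccessor.
// The accessor validates the scalar type and dimensionality once, up front.
void add_one(at::Tensor t) {
  auto a = t.accessor<float, 2>();          // TensorAccessor<float, 2>
  for (int64_t i = 0; i < a.size(0); ++i) {
    for (int64_t j = 0; j < a.size(1); ++j) {
      a[i][j] += 1.0f;                      // a[i] is a TensorAccessor<float, 1>
    }
  }
}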
+template class PtrTraits = DefaultPtrTraits> +class TensorAccessor : public TensorAccessorBase { public: - TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST_DEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : TensorAccessorBase(data_,sizes_,strides_) {} - TensorAccessor operator[](int64_t i) { + AT_HOST_DEVICE TensorAccessor operator[](int64_t i) { + return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); + } + + AT_HOST_DEVICE const TensorAccessor operator[](int64_t i) const { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } }; -template -class TensorAccessor : public TensorAccessorBase { +template class PtrTraits> +class TensorAccessor : public TensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST_DEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : TensorAccessorBase(data_,sizes_,strides_) {} + AT_HOST_DEVICE T & operator[](int64_t i) { + return this->data_[this->strides_[0]*i]; + } +}; + + +// PackedTensorAccessorBase and PackedTensorAccessor are used for CUDA `Tensor`s on the host. +// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host) +// in order to transfer them to the device when calling kernels. +// On the device, indexing of multidimensional tensors yields `TensorAccessor`s. +// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__. +// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available +// on the device, so those functions are host only. 
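A hedged CUDA sketch of the pattern described above (the names `scale` and `scale_on_gpu` are illustrative, not from this patch; it assumes compilation with nvcc so that `RestrictPtrTraits` is available, and a 2-D float CUDA tensor small enough for a one-block-per-row launch): the accessor is built on the host, carries its sizes and strides by value, and is indexed on the device.

#include <ATen/ATen.h>

// Kernel receives the accessor by value; sizes and strides were copied into it
// on the host, so no extra device-side metadata allocation is required.
__global__ void scale(at::PackedTensorAccessor<float, 2, at::RestrictPtrTraits> a,
                      float factor) {
  const int64_t row = blockIdx.x;
  const int64_t col = threadIdx.x;
  if (row < a.size(0) && col < a.size(1)) {
    a[row][col] *= factor;   // operator[] yields a 1-D TensorAccessor, then a float&
  }
}

void scale_on_gpu(at::Tensor t, float factor) {
  // Host-only construction straight from the raw pointer, sizes, and strides.
  at::PackedTensorAccessor<float, 2, at::RestrictPtrTraits>
      acc(t.data<float>(), t.sizes().data(), t.strides().data());
  scale<<<t.size(0), t.size(1)>>>(acc, factor);
}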
+template class PtrTraits = DefaultPtrTraits> +class PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + AT_HOST PackedTensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : data_(data_) + { + std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); + std::copy(strides_, strides_ + N, std::begin(this->strides_)); + } + AT_HOST_DEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOST_DEVICE int64_t size(int64_t i) const { return sizes_[i]; } +protected: + PtrType data_; + int64_t sizes_[N]; + int64_t strides_[N]; +}; + +template class PtrTraits = DefaultPtrTraits> +class PackedTensorAccessor : public PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST PackedTensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : PackedTensorAccessorBase(data_, sizes_, strides_) {}; + + AT_DEVICE TensorAccessor operator[](int64_t i) { + int64_t* new_sizes = this->sizes_+1; + int64_t* new_strides = this->strides_+1; + return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); + } + + AT_DEVICE const TensorAccessor operator[](int64_t i) const { + int64_t* new_sizes = this->sizes_+1; + int64_t* new_strides = this->strides_+1; + return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); + } +}; + +template class PtrTraits> +class PackedTensorAccessor : public PackedTensorAccessorBase { public: - TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) - : TensorAccessorBase(data_,sizes_,strides_) {} - T & operator[](int64_t i) { + typedef typename PtrTraits::PtrType PtrType; + AT_HOST PackedTensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : PackedTensorAccessorBase(data_, sizes_, strides_) {}; + + AT_DEVICE T & operator[](int64_t i) { + return this->data_[this->strides_[0]*i]; + } + AT_DEVICE const T& operator[](int64_t i) const { return this->data_[this->strides_[0]*i]; } }; diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp similarity index 79% rename from aten/src/ATen/TensorImpl.cpp rename to aten/src/ATen/core/TensorImpl.cpp index f4ecaf0b6253fa..5b568482d8dfe2 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -17,19 +17,19 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) - : TensorImpl({}, type_id, scalar_type, is_variable) { +TensorImpl::TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable) + : TensorImpl({}, type_id, data_type, is_variable) { // UndefinedTensors and SparseTensors don't have storages. 
- if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined + if (type_id != UndefinedTensorId() && data_type.id() != caffe2::TypeIdentifier::uninitialized() && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - storage_ = Storage(scalar_type, 0, allocator, true); + storage_ = Storage(data_type, 0, allocator, true); } } TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) - : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype()), is_variable) {} + : TensorImpl(std::move(storage), type_id, storage.dtype(), is_variable) {} -TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) +TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable) : storage_(std::move(storage)), storage_offset_(0), sizes_{0}, @@ -37,7 +37,7 @@ TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scala is_contiguous_(true), numel_(0), type_id_(type_id), - scalar_type_(scalar_type), + data_type_(data_type), is_variable_(is_variable) {} IntList TensorImpl::sizes() const { diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h new file mode 100644 index 00000000000000..d2f98ff52780f8 --- /dev/null +++ b/aten/src/ATen/core/TensorImpl.h @@ -0,0 +1,210 @@ +#pragma once + +#include +#include + +#include "ATen/core/Storage.h" +#include "ATen/core/optional.h" +#include "ATen/core/TensorTypeId.h" +#include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/LegacyTypeDispatch.h" +#include "ATen/core/Backend.h" + +struct THTensor; + +namespace at { +class Scalar; +struct Type; +struct Storage; +struct Tensor; +} // namespace at + +namespace at { +struct AT_API TensorImpl : public c10::intrusive_ptr_target { + TensorImpl() = delete; + TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable); + TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); + + virtual void release_resources() override; + + Type & type() const { + // NB: It's valid to use getTypeRaw here, because the TensorImpl + // could not have been created without initializing the Type first. + // TODO: This is not actually true via the Caffe2 codepath! Make + // it so. + return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), dataTypeToScalarType(dtype().id()), is_variable()); + } + + TensorTypeId type_id() const { return type_id_; } + virtual IntList sizes() const; + virtual IntList strides() const; + virtual int64_t dim() const; + virtual const Storage& storage() const; + friend struct Type; + + virtual int64_t numel() const { +#ifdef DEBUG + AT_ASSERT(compute_numel() == numel_); +#endif + return numel_; + } + + virtual bool is_contiguous() const { +#ifdef DEBUG + AT_ASSERT(compute_contiguous() == is_contiguous_); +#endif + return is_contiguous_; + } + + // this is called by the generated wrapper code when there are conditions + // when this output tensor should be zero dimensional. e.g. when all inputs + // to a function 'add' were zero dimensional, then condition_when_zero_dim == true. + // we also prevent this from getting marked as a zero dim tensor if it is not + // the right shape afterall. + virtual TensorImpl* maybe_zero_dim(bool condition_when_zero_dim); + + // True if a tensor was auto-wrapped from a C++ or Python number. 
+ // Wrapped numbers do not participate in the result type computation for + // mixed-type operations if there are any Tensors that are not wrapped + // numbers. Otherwise, they behave like their non-wrapped equivalents. + // See [Result type computation] in TensorIterator.h. + bool is_wrapped_number() const { + return is_wrapped_number_; + } + void set_wrapped_number(bool value) { + AT_ASSERT(dim() == 0); + is_wrapped_number_ = value; + } + + // ~~~~~ Autograd API ~~~~~ + // Some methods below are defined in TensorImpl.cpp because Tensor is an + // incomplete type. + + virtual void set_requires_grad(bool requires_grad) { + AT_ERROR("set_requires_grad is not implemented for Tensor"); + } + virtual bool requires_grad() const { + AT_ERROR("requires_grad is not implemented for Tensor"); + } + + virtual Tensor& grad(); + virtual const Tensor& grad() const; + + // TODO: make these protected + // Note: storage->size() may be greater than the recorded size + // of a tensor + at::Storage storage_; + + template + inline T * data() const { + return storage_.data() + storage_offset_; + } + + inline void* data() const { + return static_cast( + static_cast(storage_.data()) + + data_type_.itemsize() * storage_offset_); + } + + template + inline T * unsafe_data() const { + return storage_.unsafe_data() + storage_offset_; + } + + const caffe2::TypeMeta& dtype() const { + return data_type_; + } + + virtual int64_t storage_offset() const { + return storage_offset_; + } + + // represents that numel() == 0. + inline bool is_empty() const { + return numel() == 0; + } + + virtual void resize_dim(int64_t ndim) { + // NB: This is *truly* a resize; calling code (e.g., squeeze) + // assumes that old values are preserved + sizes_.resize(ndim); + strides_.resize(ndim); + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_size(int64_t dim, int64_t new_size) { + sizes_[dim] = new_size; + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_stride(int64_t dim, int64_t new_stride) { + strides_[dim] = new_stride; + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_storage_offset(int64_t storage_offset) { + storage_offset_ = storage_offset; + refresh_numel(); + refresh_contiguous(); + } + + // WARNING: This function does not check if the requested + // sizes/strides are in bounds for the storage that is allocated; + // this is the responsibility of the caller + void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { + AT_CHECK( + new_size.size() == new_stride.size(), + "dimensionality of sizes (", + new_size.size(), + ") must match dimensionality of strides (", + new_stride.size(), + ")"); + sizes_ = new_size.vec(); + strides_ = new_stride.vec(); + refresh_numel(); + refresh_contiguous(); + } + + virtual int64_t size(int64_t d) const; + virtual int64_t stride(int64_t d) const; + + bool is_variable() const { return is_variable_; }; + + private: + int64_t storage_offset_; + std::vector sizes_; + std::vector strides_; + + bool is_contiguous_; + int64_t numel_; + + int64_t compute_numel() const { + int64_t n = 1; + for (auto s : sizes()) { + n *= s; + } + return n; + } + bool compute_contiguous() const; + + protected: + void refresh_numel() { + numel_ = compute_numel(); + } + void refresh_contiguous() { + is_contiguous_ = compute_contiguous(); + } + TensorTypeId type_id_; + // INVARIANT: When storage is non-null, this type meta must + // agree with the type meta in storage + caffe2::TypeMeta data_type_; + bool is_variable_ = false; + bool is_wrapped_number_ = 
false; + + private: + TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable); +}; +} // namespace at diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h new file mode 100644 index 00000000000000..ff85267e78fb81 --- /dev/null +++ b/aten/src/ATen/core/TensorMethods.h @@ -0,0 +1,1258 @@ +#pragma once + +#include "ATen/core/Tensor.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Type.h" +#include "ATen/core/TensorOptions.h" + +namespace at { + +inline Tensor Tensor::toType(const Type & t, bool non_blocking) const { + if(type() == t) + return *this; + return t.copy(*this, non_blocking); +} + +inline Tensor Tensor::cpu() const { + return toType(type().cpu()); +} + +inline Tensor Tensor::cuda() const { + return toType(type().cuda()); +} + +inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { + return type().copy_(*this, src, non_blocking); +} + +inline Tensor Tensor::toType(ScalarType t) const { + return toType(type().toScalarType(t)); +} + +inline Tensor Tensor::toBackend(Backend b) const { + return toType(type().toBackend(b)); +} + +inline TensorOptions Tensor::options() const { + return TensorOptions().dtype(dtype()) + .device(device()) + .layout(layout()) + .is_variable(is_variable()); +} + +inline void Tensor::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + type().backward(*this, std::move(gradient), keep_graph, create_graph); +} + +inline void Tensor::set_data(Tensor new_data) { + type().set_data(*this, new_data); +} + +// all static inline to allow for inlining of the non-dynamic part of dispatch +inline int64_t Tensor::storage_offset() const { + return type().storage_offset(*this); +} +inline Tensor & Tensor::resize_(IntList size) { + return type().resize_(*this, size); +} +inline Tensor & Tensor::set_(Storage source) { + return type().set_(*this, source); +} +inline Tensor & Tensor::set_(Storage source, int64_t storage_offset, IntList size, IntList stride) { + return type().set_(*this, source, storage_offset, size, stride); +} +inline Tensor & Tensor::set_(const Tensor & source) { + return type().set_(*this, source); +} +inline Tensor & Tensor::set_() { + return type().set_(*this); +} +inline bool Tensor::is_contiguous() const { + return type().is_contiguous(*this); +} +inline bool Tensor::is_set_to(const Tensor & tensor) const { + return type().is_set_to(*this, tensor); +} +inline Tensor & Tensor::masked_fill_(const Tensor & mask, Scalar value) { + return type().masked_fill_(*this, mask, value); +} +inline Tensor & Tensor::masked_fill_(const Tensor & mask, const Tensor & value) { + return type().masked_fill_(*this, mask, value); +} +inline Tensor & Tensor::masked_scatter_(const Tensor & mask, const Tensor & source) { + return type().masked_scatter_(*this, mask, source); +} +inline Tensor Tensor::masked_select(const Tensor & mask) const { + return type().masked_select(*this, mask); +} +inline Tensor Tensor::nonzero() const { + return type().nonzero(*this); +} +inline Tensor Tensor::contiguous() const { + return type().contiguous(*this); +} +inline Tensor Tensor::view(IntList size) const { + return type().view(*this, size); +} +inline Tensor Tensor::index_select(int64_t dim, const Tensor & index) const { + return type().index_select(*this, dim, index); +} +inline Tensor Tensor::take(const Tensor & index) const { + return type().take(*this, index); +} +inline Tensor & Tensor::put_(const Tensor & index, const Tensor 
& source, bool accumulate) { + return type().put_(*this, index, source, accumulate); +} +inline Tensor & Tensor::index_add_(int64_t dim, const Tensor & index, const Tensor & source) { + return type().index_add_(*this, dim, index, source); +} +inline Tensor & Tensor::index_fill_(int64_t dim, const Tensor & index, Scalar value) { + return type().index_fill_(*this, dim, index, value); +} +inline Tensor & Tensor::index_fill_(int64_t dim, const Tensor & index, const Tensor & value) { + return type().index_fill_(*this, dim, index, value); +} +inline Tensor Tensor::unfold(int64_t dimension, int64_t size, int64_t step) const { + return type().unfold(*this, dimension, size, step); +} +inline Tensor & Tensor::scatter_(int64_t dim, const Tensor & index, const Tensor & src) { + return type().scatter_(*this, dim, index, src); +} +inline Tensor & Tensor::scatter_(int64_t dim, const Tensor & index, Scalar value) { + return type().scatter_(*this, dim, index, value); +} +inline Tensor & Tensor::scatter_add_(int64_t dim, const Tensor & index, const Tensor & src) { + return type().scatter_add_(*this, dim, index, src); +} +inline Tensor Tensor::gather(int64_t dim, const Tensor & index) const { + return type().gather(*this, dim, index); +} +inline void* Tensor::data_ptr() const { + return type().data_ptr(*this); +} +inline bool Tensor::equal(const Tensor & other) const { + return type().equal(*this, other); +} +inline Tensor Tensor::__and__(Scalar other) const { + return type().__and__(*this, other); +} +inline Tensor Tensor::__and__(const Tensor & other) const { + return type().__and__(*this, other); +} +inline Tensor & Tensor::__iand__(Scalar other) { + return type().__iand__(*this, other); +} +inline Tensor & Tensor::__iand__(const Tensor & other) { + return type().__iand__(*this, other); +} +inline Tensor Tensor::__or__(Scalar other) const { + return type().__or__(*this, other); +} +inline Tensor Tensor::__or__(const Tensor & other) const { + return type().__or__(*this, other); +} +inline Tensor & Tensor::__ior__(Scalar other) { + return type().__ior__(*this, other); +} +inline Tensor & Tensor::__ior__(const Tensor & other) { + return type().__ior__(*this, other); +} +inline Tensor Tensor::__xor__(Scalar other) const { + return type().__xor__(*this, other); +} +inline Tensor Tensor::__xor__(const Tensor & other) const { + return type().__xor__(*this, other); +} +inline Tensor & Tensor::__ixor__(Scalar other) { + return type().__ixor__(*this, other); +} +inline Tensor & Tensor::__ixor__(const Tensor & other) { + return type().__ixor__(*this, other); +} +inline Tensor Tensor::__lshift__(Scalar other) const { + return type().__lshift__(*this, other); +} +inline Tensor Tensor::__lshift__(const Tensor & other) const { + return type().__lshift__(*this, other); +} +inline Tensor & Tensor::__ilshift__(Scalar other) { + return type().__ilshift__(*this, other); +} +inline Tensor & Tensor::__ilshift__(const Tensor & other) { + return type().__ilshift__(*this, other); +} +inline Tensor Tensor::__rshift__(Scalar other) const { + return type().__rshift__(*this, other); +} +inline Tensor Tensor::__rshift__(const Tensor & other) const { + return type().__rshift__(*this, other); +} +inline Tensor & Tensor::__irshift__(Scalar other) { + return type().__irshift__(*this, other); +} +inline Tensor & Tensor::__irshift__(const Tensor & other) { + return type().__irshift__(*this, other); +} +inline Tensor Tensor::lt(Scalar other) const { + return type().lt(*this, other); +} +inline Tensor Tensor::lt(const Tensor & other) const { 
+ return type().lt(*this, other); +} +inline Tensor & Tensor::lt_(Scalar other) { + return type().lt_(*this, other); +} +inline Tensor & Tensor::lt_(const Tensor & other) { + return type().lt_(*this, other); +} +inline Tensor Tensor::gt(Scalar other) const { + return type().gt(*this, other); +} +inline Tensor Tensor::gt(const Tensor & other) const { + return type().gt(*this, other); +} +inline Tensor & Tensor::gt_(Scalar other) { + return type().gt_(*this, other); +} +inline Tensor & Tensor::gt_(const Tensor & other) { + return type().gt_(*this, other); +} +inline Tensor Tensor::le(Scalar other) const { + return type().le(*this, other); +} +inline Tensor Tensor::le(const Tensor & other) const { + return type().le(*this, other); +} +inline Tensor & Tensor::le_(Scalar other) { + return type().le_(*this, other); +} +inline Tensor & Tensor::le_(const Tensor & other) { + return type().le_(*this, other); +} +inline Tensor Tensor::ge(Scalar other) const { + return type().ge(*this, other); +} +inline Tensor Tensor::ge(const Tensor & other) const { + return type().ge(*this, other); +} +inline Tensor & Tensor::ge_(Scalar other) { + return type().ge_(*this, other); +} +inline Tensor & Tensor::ge_(const Tensor & other) { + return type().ge_(*this, other); +} +inline Tensor Tensor::eq(Scalar other) const { + return type().eq(*this, other); +} +inline Tensor Tensor::eq(const Tensor & other) const { + return type().eq(*this, other); +} +inline Tensor & Tensor::eq_(Scalar other) { + return type().eq_(*this, other); +} +inline Tensor & Tensor::eq_(const Tensor & other) { + return type().eq_(*this, other); +} +inline Tensor Tensor::ne(Scalar other) const { + return type().ne(*this, other); +} +inline Tensor Tensor::ne(const Tensor & other) const { + return type().ne(*this, other); +} +inline Tensor & Tensor::ne_(Scalar other) { + return type().ne_(*this, other); +} +inline Tensor & Tensor::ne_(const Tensor & other) { + return type().ne_(*this, other); +} +inline Tensor Tensor::min(const Tensor & other) const { + return type().min(*this, other); +} +inline Tensor Tensor::min() const { + return type().min(*this); +} +inline Tensor Tensor::max(const Tensor & other) const { + return type().max(*this, other); +} +inline Tensor Tensor::max() const { + return type().max(*this); +} +inline Tensor Tensor::median() const { + return type().median(*this); +} +inline std::tuple Tensor::sort(int64_t dim, bool descending) const { + return type().sort(*this, dim, descending); +} +inline std::tuple Tensor::topk(int64_t k, int64_t dim, bool largest, bool sorted) const { + return type().topk(*this, k, dim, largest, sorted); +} +inline Tensor Tensor::all() const { + return type().all(*this); +} +inline Tensor Tensor::any() const { + return type().any(*this); +} +inline Tensor Tensor::lgamma() const { + return type().lgamma(*this); +} +inline Tensor & Tensor::lgamma_() { + return type().lgamma_(*this); +} +inline Tensor Tensor::digamma() const { + return type().digamma(*this); +} +inline Tensor & Tensor::digamma_() { + return type().digamma_(*this); +} +inline Tensor Tensor::polygamma(int64_t n) const { + return type().polygamma(n, *this); +} +inline Tensor & Tensor::polygamma_(int64_t n) { + return type().polygamma_(*this, n); +} +inline Tensor & Tensor::erfinv_() { + return type().erfinv_(*this); +} +inline Tensor Tensor::erfinv() const { + return type().erfinv(*this); +} +inline Tensor & Tensor::frac_() { + return type().frac_(*this); +} +inline Tensor Tensor::frac() const { + return type().frac(*this); +} +inline Tensor 
Tensor::renorm(Scalar p, int64_t dim, Scalar maxnorm) const { + return type().renorm(*this, p, dim, maxnorm); +} +inline Tensor & Tensor::renorm_(Scalar p, int64_t dim, Scalar maxnorm) { + return type().renorm_(*this, p, dim, maxnorm); +} +inline Tensor Tensor::dist(const Tensor & other, Scalar p) const { + return type().dist(*this, other, p); +} +inline Tensor Tensor::reciprocal() const { + return type().reciprocal(*this); +} +inline Tensor & Tensor::reciprocal_() { + return type().reciprocal_(*this); +} +inline Tensor Tensor::neg() const { + return type().neg(*this); +} +inline Tensor & Tensor::neg_() { + return type().neg_(*this); +} +inline Tensor Tensor::atan2(const Tensor & other) const { + return type().atan2(*this, other); +} +inline Tensor & Tensor::atan2_(const Tensor & other) { + return type().atan2_(*this, other); +} +inline Tensor Tensor::pow(const Tensor & exponent) const { + return type().pow(*this, exponent); +} +inline Tensor & Tensor::pow_(Scalar exponent) { + return type().pow_(*this, exponent); +} +inline Tensor & Tensor::pow_(const Tensor & exponent) { + return type().pow_(*this, exponent); +} +inline Tensor Tensor::lerp(const Tensor & end, Scalar weight) const { + return type().lerp(*this, end, weight); +} +inline Tensor & Tensor::lerp_(const Tensor & end, Scalar weight) { + return type().lerp_(*this, end, weight); +} +inline Tensor Tensor::histc(int64_t bins, Scalar min, Scalar max) const { + return type().histc(*this, bins, min, max); +} +inline Tensor Tensor::sign() const { + return type().sign(*this); +} +inline Tensor & Tensor::sign_() { + return type().sign_(*this); +} +inline Tensor Tensor::trace() const { + return type().trace(*this); +} +inline Tensor Tensor::fmod(Scalar other) const { + return type().fmod(*this, other); +} +inline Tensor Tensor::fmod(const Tensor & other) const { + return type().fmod(*this, other); +} +inline Tensor & Tensor::fmod_(Scalar other) { + return type().fmod_(*this, other); +} +inline Tensor & Tensor::fmod_(const Tensor & other) { + return type().fmod_(*this, other); +} +inline Tensor Tensor::remainder(Scalar other) const { + return type().remainder(*this, other); +} +inline Tensor Tensor::remainder(const Tensor & other) const { + return type().remainder(*this, other); +} +inline Tensor & Tensor::remainder_(Scalar other) { + return type().remainder_(*this, other); +} +inline Tensor & Tensor::remainder_(const Tensor & other) { + return type().remainder_(*this, other); +} +inline Tensor Tensor::tril(int64_t diagonal) const { + return type().tril(*this, diagonal); +} +inline Tensor & Tensor::tril_(int64_t diagonal) { + return type().tril_(*this, diagonal); +} +inline Tensor Tensor::triu(int64_t diagonal) const { + return type().triu(*this, diagonal); +} +inline Tensor & Tensor::triu_(int64_t diagonal) { + return type().triu_(*this, diagonal); +} +inline Tensor Tensor::cross(const Tensor & other, int64_t dim) const { + return type().cross(*this, other, dim); +} +inline Tensor Tensor::diag(int64_t diagonal) const { + return type().diag(*this, diagonal); +} +inline Tensor Tensor::addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { + return type().addbmm(*this, batch1, batch2, beta, alpha); +} +inline Tensor & Tensor::addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return type().addbmm_(*this, batch1, batch2, beta, alpha); +} +inline Tensor Tensor::addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { + return type().addcmul(*this, tensor1, 
tensor2, value); +} +inline Tensor & Tensor::addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return type().addcmul_(*this, tensor1, tensor2, value); +} +inline Tensor Tensor::addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { + return type().addcdiv(*this, tensor1, tensor2, value); +} +inline Tensor & Tensor::addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return type().addcdiv_(*this, tensor1, tensor2, value); +} +inline std::tuple Tensor::gels(const Tensor & A) const { + return type().gels(*this, A); +} +inline std::tuple Tensor::trtrs(const Tensor & A, bool upper, bool transpose, bool unitriangular) const { + return type().trtrs(*this, A, upper, transpose, unitriangular); +} +inline std::tuple Tensor::symeig(bool eigenvectors, bool upper) const { + return type().symeig(*this, eigenvectors, upper); +} +inline std::tuple Tensor::eig(bool eigenvectors) const { + return type().eig(*this, eigenvectors); +} +inline std::tuple Tensor::svd(bool some) const { + return type().svd(*this, some); +} +inline Tensor Tensor::potrf(bool upper) const { + return type().potrf(*this, upper); +} +inline Tensor Tensor::potrs(const Tensor & input2, bool upper) const { + return type().potrs(*this, input2, upper); +} +inline Tensor Tensor::potri(bool upper) const { + return type().potri(*this, upper); +} +inline std::tuple Tensor::pstrf(bool upper, Scalar tol) const { + return type().pstrf(*this, upper, tol); +} +inline std::tuple Tensor::qr() const { + return type().qr(*this); +} +inline std::tuple Tensor::geqrf() const { + return type().geqrf(*this); +} +inline Tensor Tensor::orgqr(const Tensor & input2) const { + return type().orgqr(*this, input2); +} +inline Tensor Tensor::ormqr(const Tensor & input2, const Tensor & input3, bool left, bool transpose) const { + return type().ormqr(*this, input2, input3, left, transpose); +} +inline std::tuple Tensor::btrifact(bool pivot) const { + return type().btrifact(*this, pivot); +} +inline std::tuple Tensor::btrifact_with_info(bool pivot) const { + return type().btrifact_with_info(*this, pivot); +} +inline Tensor Tensor::btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const { + return type().btrisolve(*this, LU_data, LU_pivots); +} +inline Tensor & Tensor::random_(int64_t from, int64_t to, Generator * generator) { + return type().random_(*this, from, to, generator); +} +inline Tensor & Tensor::random_(int64_t to, Generator * generator) { + return type().random_(*this, to, generator); +} +inline Tensor & Tensor::random_(Generator * generator) { + return type().random_(*this, generator); +} +inline Tensor Tensor::multinomial(int64_t num_samples, bool replacement, Generator * generator) const { + return type().multinomial(*this, num_samples, replacement, generator); +} +inline Tensor & Tensor::uniform_(double from, double to, Generator * generator) { + return type().uniform_(*this, from, to, generator); +} +inline Tensor & Tensor::normal_(double mean, double std, Generator * generator) { + return type().normal_(*this, mean, std, generator); +} +inline Tensor & Tensor::cauchy_(double median, double sigma, Generator * generator) { + return type().cauchy_(*this, median, sigma, generator); +} +inline Tensor & Tensor::log_normal_(double mean, double std, Generator * generator) { + return type().log_normal_(*this, mean, std, generator); +} +inline Tensor & Tensor::exponential_(double lambd, Generator * generator) { + return type().exponential_(*this, lambd, generator); +} +inline Tensor 
& Tensor::geometric_(double p, Generator * generator) { + return type().geometric_(*this, p, generator); +} +inline Tensor Tensor::abs() const { + return type().abs(*this); +} +inline Tensor & Tensor::abs_() { + return type().abs_(*this); +} +inline Tensor Tensor::acos() const { + return type().acos(*this); +} +inline Tensor & Tensor::acos_() { + return type().acos_(*this); +} +inline Tensor Tensor::add(const Tensor & other, Scalar alpha) const { + return type().add(*this, other, alpha); +} +inline Tensor & Tensor::add_(const Tensor & other, Scalar alpha) { + return type().add_(*this, other, alpha); +} +inline Tensor Tensor::add(Scalar other, Scalar alpha) const { + return type().add(*this, other, alpha); +} +inline Tensor & Tensor::add_(Scalar other, Scalar alpha) { + return type().add_(*this, other, alpha); +} +inline Tensor Tensor::addmv(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const { + return type().addmv(*this, mat, vec, beta, alpha); +} +inline Tensor & Tensor::addmv_(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return type().addmv_(*this, mat, vec, beta, alpha); +} +inline Tensor Tensor::addr(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const { + return type().addr(*this, vec1, vec2, beta, alpha); +} +inline Tensor & Tensor::addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return type().addr_(*this, vec1, vec2, beta, alpha); +} +inline Tensor Tensor::all(int64_t dim, bool keepdim) const { + return type().all(*this, dim, keepdim); +} +inline bool Tensor::allclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { + return type().allclose(*this, other, rtol, atol, equal_nan); +} +inline Tensor Tensor::any(int64_t dim, bool keepdim) const { + return type().any(*this, dim, keepdim); +} +inline Tensor Tensor::argmax(int64_t dim, bool keepdim) const { + return type().argmax(*this, dim, keepdim); +} +inline Tensor Tensor::argmax() const { + return type().argmax(*this); +} +inline Tensor Tensor::argmin(int64_t dim, bool keepdim) const { + return type().argmin(*this, dim, keepdim); +} +inline Tensor Tensor::argmin() const { + return type().argmin(*this); +} +inline Tensor Tensor::as_strided(IntList size, IntList stride) const { + return type().as_strided(*this, size, stride); +} +inline Tensor & Tensor::as_strided_(IntList size, IntList stride) { + return type().as_strided_(*this, size, stride); +} +inline Tensor Tensor::as_strided(IntList size, IntList stride, int64_t storage_offset) const { + return type().as_strided(*this, size, stride, storage_offset); +} +inline Tensor & Tensor::as_strided_(IntList size, IntList stride, int64_t storage_offset) { + return type().as_strided_(*this, size, stride, storage_offset); +} +inline Tensor Tensor::asin() const { + return type().asin(*this); +} +inline Tensor & Tensor::asin_() { + return type().asin_(*this); +} +inline Tensor Tensor::atan() const { + return type().atan(*this); +} +inline Tensor & Tensor::atan_() { + return type().atan_(*this); +} +inline Tensor Tensor::baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { + return type().baddbmm(*this, batch1, batch2, beta, alpha); +} +inline Tensor & Tensor::baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return type().baddbmm_(*this, batch1, batch2, beta, alpha); +} +inline Tensor Tensor::bernoulli(const Tensor & p, Generator * generator) const { + return type().bernoulli(*this, p, generator); +} 
+inline Tensor Tensor::bernoulli(double p, Generator * generator) const { + return type().bernoulli(*this, p, generator); +} +inline Tensor Tensor::bernoulli() const { + return type().bernoulli(*this); +} +inline Tensor & Tensor::bernoulli_(const Tensor & p, Generator * generator) { + return type().bernoulli_(*this, p, generator); +} +inline Tensor & Tensor::bernoulli_(double p, Generator * generator) { + return type().bernoulli_(*this, p, generator); +} +inline Tensor & Tensor::bernoulli_() { + return type().bernoulli_(*this); +} +inline Tensor Tensor::bincount(const Tensor & weights, int64_t minlength) const { + return type().bincount(*this, weights, minlength); +} +inline Tensor Tensor::bmm(const Tensor & mat2) const { + return type().bmm(*this, mat2); +} +inline Tensor Tensor::ceil() const { + return type().ceil(*this); +} +inline Tensor & Tensor::ceil_() { + return type().ceil_(*this); +} +inline std::vector Tensor::chunk(int64_t chunks, int64_t dim) const { + return type().chunk(*this, chunks, dim); +} +inline Tensor Tensor::clamp(Scalar min, Scalar max) const { + return type().clamp(*this, min, max); +} +inline Tensor & Tensor::clamp_(Scalar min, Scalar max) { + return type().clamp_(*this, min, max); +} +inline Tensor Tensor::clamp_max(Scalar max) const { + return type().clamp_max(*this, max); +} +inline Tensor & Tensor::clamp_max_(Scalar max) { + return type().clamp_max_(*this, max); +} +inline Tensor Tensor::clamp_min(Scalar min) const { + return type().clamp_min(*this, min); +} +inline Tensor & Tensor::clamp_min_(Scalar min) { + return type().clamp_min_(*this, min); +} +inline Tensor Tensor::cos() const { + return type().cos(*this); +} +inline Tensor & Tensor::cos_() { + return type().cos_(*this); +} +inline Tensor Tensor::cosh() const { + return type().cosh(*this); +} +inline Tensor & Tensor::cosh_() { + return type().cosh_(*this); +} +inline Tensor Tensor::cumsum(int64_t dim, ScalarType dtype) const { + return type().cumsum(*this, dim, dtype); +} +inline Tensor Tensor::cumsum(int64_t dim) const { + return type().cumsum(*this, dim); +} +inline Tensor Tensor::cumprod(int64_t dim, ScalarType dtype) const { + return type().cumprod(*this, dim, dtype); +} +inline Tensor Tensor::cumprod(int64_t dim) const { + return type().cumprod(*this, dim); +} +inline Tensor Tensor::det() const { + return type().det(*this); +} +inline Tensor Tensor::diagflat(int64_t offset) const { + return type().diagflat(*this, offset); +} +inline Tensor Tensor::diagonal(int64_t offset, int64_t dim1, int64_t dim2) const { + return type().diagonal(*this, offset, dim1, dim2); +} +inline Tensor Tensor::div(const Tensor & other) const { + return type().div(*this, other); +} +inline Tensor & Tensor::div_(const Tensor & other) { + return type().div_(*this, other); +} +inline Tensor Tensor::div(Scalar other) const { + return type().div(*this, other); +} +inline Tensor & Tensor::div_(Scalar other) { + return type().div_(*this, other); +} +inline Tensor Tensor::dot(const Tensor & tensor) const { + return type().dot(*this, tensor); +} +inline Tensor Tensor::erf() const { + return type().erf(*this); +} +inline Tensor & Tensor::erf_() { + return type().erf_(*this); +} +inline Tensor Tensor::erfc() const { + return type().erfc(*this); +} +inline Tensor & Tensor::erfc_() { + return type().erfc_(*this); +} +inline Tensor Tensor::exp() const { + return type().exp(*this); +} +inline Tensor & Tensor::exp_() { + return type().exp_(*this); +} +inline Tensor Tensor::expm1() const { + return type().expm1(*this); +} +inline Tensor & 
Tensor::expm1_() { + return type().expm1_(*this); +} +inline Tensor Tensor::expand(IntList size, bool implicit) const { + return type().expand(*this, size, implicit); +} +inline Tensor Tensor::expand_as(const Tensor & other) const { + return type().expand_as(*this, other); +} +inline Tensor Tensor::flatten(int64_t start_dim, int64_t end_dim) const { + return type().flatten(*this, start_dim, end_dim); +} +inline Tensor & Tensor::fill_(Scalar value) { + return type().fill_(*this, value); +} +inline Tensor & Tensor::fill_(const Tensor & value) { + return type().fill_(*this, value); +} +inline Tensor Tensor::floor() const { + return type().floor(*this); +} +inline Tensor & Tensor::floor_() { + return type().floor_(*this); +} +inline Tensor Tensor::ger(const Tensor & vec2) const { + return type().ger(*this, vec2); +} +inline std::tuple Tensor::gesv(const Tensor & A) const { + return type().gesv(*this, A); +} +inline Tensor Tensor::fft(int64_t signal_ndim, bool normalized) const { + return type().fft(*this, signal_ndim, normalized); +} +inline Tensor Tensor::ifft(int64_t signal_ndim, bool normalized) const { + return type().ifft(*this, signal_ndim, normalized); +} +inline Tensor Tensor::rfft(int64_t signal_ndim, bool normalized, bool onesided) const { + return type().rfft(*this, signal_ndim, normalized, onesided); +} +inline Tensor Tensor::irfft(int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const { + return type().irfft(*this, signal_ndim, normalized, onesided, signal_sizes); +} +inline Tensor Tensor::index(TensorList indices) const { + return type().index(*this, indices); +} +inline Tensor & Tensor::index_copy_(int64_t dim, const Tensor & index, const Tensor & source) { + return type().index_copy_(*this, dim, index, source); +} +inline Tensor Tensor::index_put(TensorList indices, const Tensor & values) const { + return type().index_put(*this, indices, values); +} +inline Tensor & Tensor::index_put_(TensorList indices, const Tensor & values) { + return type().index_put_(*this, indices, values); +} +inline Tensor Tensor::inverse() const { + return type().inverse(*this); +} +inline Tensor Tensor::isclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { + return type().isclose(*this, other, rtol, atol, equal_nan); +} +inline bool Tensor::is_cuda() const { + return type().is_cuda(*this); +} +inline bool Tensor::is_distributed() const { + return type().is_distributed(*this); +} +inline bool Tensor::is_floating_point() const { + return type().is_floating_point(*this); +} +inline bool Tensor::is_complex() const { + return type().is_complex(*this); +} +inline bool Tensor::is_nonzero() const { + return type().is_nonzero(*this); +} +inline bool Tensor::is_same_size(const Tensor & other) const { + return type().is_same_size(*this, other); +} +inline bool Tensor::is_signed() const { + return type().is_signed(*this); +} +inline bool Tensor::is_sparse() const { + return type().is_sparse(*this); +} +inline std::tuple Tensor::kthvalue(int64_t k, int64_t dim, bool keepdim) const { + return type().kthvalue(*this, k, dim, keepdim); +} +inline Tensor Tensor::log() const { + return type().log(*this); +} +inline Tensor & Tensor::log_() { + return type().log_(*this); +} +inline Tensor Tensor::log10() const { + return type().log10(*this); +} +inline Tensor & Tensor::log10_() { + return type().log10_(*this); +} +inline Tensor Tensor::log1p() const { + return type().log1p(*this); +} +inline Tensor & Tensor::log1p_() { + return type().log1p_(*this); +} +inline Tensor 
Tensor::log2() const { + return type().log2(*this); +} +inline Tensor & Tensor::log2_() { + return type().log2_(*this); +} +inline Tensor Tensor::logdet() const { + return type().logdet(*this); +} +inline Tensor Tensor::log_softmax(int64_t dim) const { + return type().log_softmax(*this, dim); +} +inline Tensor Tensor::logsumexp(int64_t dim, bool keepdim) const { + return type().logsumexp(*this, dim, keepdim); +} +inline Tensor Tensor::matmul(const Tensor & other) const { + return type().matmul(*this, other); +} +inline Tensor Tensor::matrix_power(int64_t n) const { + return type().matrix_power(*this, n); +} +inline std::tuple Tensor::max(int64_t dim, bool keepdim) const { + return type().max(*this, dim, keepdim); +} +inline Tensor Tensor::max_values(int64_t dim, bool keepdim) const { + return type().max_values(*this, dim, keepdim); +} +inline Tensor Tensor::mean(ScalarType dtype) const { + return type().mean(*this, dtype); +} +inline Tensor Tensor::mean() const { + return type().mean(*this); +} +inline Tensor Tensor::mean(int64_t dim, bool keepdim, ScalarType dtype) const { + return type().mean(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::mean(int64_t dim, bool keepdim) const { + return type().mean(*this, dim, keepdim); +} +inline Tensor Tensor::mean(int64_t dim, ScalarType dtype) const { + return type().mean(*this, dim, dtype); +} +inline std::tuple Tensor::median(int64_t dim, bool keepdim) const { + return type().median(*this, dim, keepdim); +} +inline std::tuple Tensor::min(int64_t dim, bool keepdim) const { + return type().min(*this, dim, keepdim); +} +inline Tensor Tensor::min_values(int64_t dim, bool keepdim) const { + return type().min_values(*this, dim, keepdim); +} +inline Tensor Tensor::mm(const Tensor & mat2) const { + return type().mm(*this, mat2); +} +inline std::tuple Tensor::mode(int64_t dim, bool keepdim) const { + return type().mode(*this, dim, keepdim); +} +inline Tensor Tensor::mul(const Tensor & other) const { + return type().mul(*this, other); +} +inline Tensor & Tensor::mul_(const Tensor & other) { + return type().mul_(*this, other); +} +inline Tensor Tensor::mul(Scalar other) const { + return type().mul(*this, other); +} +inline Tensor & Tensor::mul_(Scalar other) { + return type().mul_(*this, other); +} +inline Tensor Tensor::mv(const Tensor & vec) const { + return type().mv(*this, vec); +} +inline Tensor Tensor::mvlgamma(int64_t p) const { + return type().mvlgamma(*this, p); +} +inline Tensor & Tensor::mvlgamma_(int64_t p) { + return type().mvlgamma_(*this, p); +} +inline Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const { + return type().narrow(*this, dim, start, length); +} +inline Tensor Tensor::permute(IntList dims) const { + return type().permute(*this, dims); +} +inline Tensor Tensor::pin_memory() const { + return type().pin_memory(*this); +} +inline Tensor Tensor::pinverse(double rcond) const { + return type().pinverse(*this, rcond); +} +inline Tensor Tensor::repeat(IntList repeats) const { + return type().repeat(*this, repeats); +} +inline Tensor Tensor::reshape(IntList shape) const { + return type().reshape(*this, shape); +} +inline Tensor Tensor::reshape_as(const Tensor & other) const { + return type().reshape_as(*this, other); +} +inline Tensor Tensor::round() const { + return type().round(*this); +} +inline Tensor & Tensor::round_() { + return type().round_(*this); +} +inline Tensor Tensor::relu() const { + return type().relu(*this); +} +inline Tensor & Tensor::relu_() { + return type().relu_(*this); +} +inline Tensor 
Tensor::hardshrink(Scalar lambd) const { + return type().hardshrink(*this, lambd); +} +inline Tensor Tensor::hardshrink_backward(const Tensor & grad_out, Scalar lambd) const { + return type().hardshrink_backward(grad_out, *this, lambd); +} +inline Tensor Tensor::rsqrt() const { + return type().rsqrt(*this); +} +inline Tensor & Tensor::rsqrt_() { + return type().rsqrt_(*this); +} +inline Tensor Tensor::select(int64_t dim, int64_t index) const { + return type().select(*this, dim, index); +} +inline Tensor Tensor::sigmoid() const { + return type().sigmoid(*this); +} +inline Tensor & Tensor::sigmoid_() { + return type().sigmoid_(*this); +} +inline Tensor Tensor::sin() const { + return type().sin(*this); +} +inline Tensor & Tensor::sin_() { + return type().sin_(*this); +} +inline Tensor Tensor::sinh() const { + return type().sinh(*this); +} +inline Tensor & Tensor::sinh_() { + return type().sinh_(*this); +} +inline Tensor Tensor::detach() const { + return type().detach(*this); +} +inline Tensor & Tensor::detach_() { + return type().detach_(*this); +} +inline int64_t Tensor::size(int64_t dim) const { + return type().size(*this, dim); +} +inline Tensor Tensor::slice(int64_t dim, int64_t start, int64_t end, int64_t step) const { + return type().slice(*this, dim, start, end, step); +} +inline std::tuple Tensor::slogdet() const { + return type().slogdet(*this); +} +inline Tensor Tensor::smm(const Tensor & mat2) const { + return type().smm(*this, mat2); +} +inline Tensor Tensor::softmax(int64_t dim) const { + return type().softmax(*this, dim); +} +inline std::vector Tensor::split(int64_t split_size, int64_t dim) const { + return type().split(*this, split_size, dim); +} +inline std::vector Tensor::split_with_sizes(IntList split_sizes, int64_t dim) const { + return type().split_with_sizes(*this, split_sizes, dim); +} +inline Tensor Tensor::squeeze() const { + return type().squeeze(*this); +} +inline Tensor Tensor::squeeze(int64_t dim) const { + return type().squeeze(*this, dim); +} +inline Tensor & Tensor::squeeze_() { + return type().squeeze_(*this); +} +inline Tensor & Tensor::squeeze_(int64_t dim) { + return type().squeeze_(*this, dim); +} +inline Tensor Tensor::sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { + return type().sspaddmm(*this, mat1, mat2, beta, alpha); +} +inline Tensor Tensor::stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const { + return type().stft(*this, n_fft, hop_length, win_length, window, normalized, onesided); +} +inline int64_t Tensor::stride(int64_t dim) const { + return type().stride(*this, dim); +} +inline Tensor Tensor::sum(ScalarType dtype) const { + return type().sum(*this, dtype); +} +inline Tensor Tensor::sum() const { + return type().sum(*this); +} +inline Tensor Tensor::sum(IntList dim, bool keepdim, ScalarType dtype) const { + return type().sum(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::sum(IntList dim, bool keepdim) const { + return type().sum(*this, dim, keepdim); +} +inline Tensor Tensor::sum(IntList dim, ScalarType dtype) const { + return type().sum(*this, dim, dtype); +} +inline Tensor Tensor::sqrt() const { + return type().sqrt(*this); +} +inline Tensor & Tensor::sqrt_() { + return type().sqrt_(*this); +} +inline Tensor Tensor::std(bool unbiased) const { + return type().std(*this, unbiased); +} +inline Tensor Tensor::std(int64_t dim, bool unbiased, bool keepdim) const { + return type().std(*this, dim, unbiased, keepdim); +} +inline Tensor 
Tensor::prod(ScalarType dtype) const { + return type().prod(*this, dtype); +} +inline Tensor Tensor::prod() const { + return type().prod(*this); +} +inline Tensor Tensor::prod(int64_t dim, bool keepdim, ScalarType dtype) const { + return type().prod(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::prod(int64_t dim, bool keepdim) const { + return type().prod(*this, dim, keepdim); +} +inline Tensor Tensor::prod(int64_t dim, ScalarType dtype) const { + return type().prod(*this, dim, dtype); +} +inline Tensor Tensor::t() const { + return type().t(*this); +} +inline Tensor & Tensor::t_() { + return type().t_(*this); +} +inline Tensor Tensor::tan() const { + return type().tan(*this); +} +inline Tensor & Tensor::tan_() { + return type().tan_(*this); +} +inline Tensor Tensor::tanh() const { + return type().tanh(*this); +} +inline Tensor & Tensor::tanh_() { + return type().tanh_(*this); +} +inline Tensor Tensor::transpose(int64_t dim0, int64_t dim1) const { + return type().transpose(*this, dim0, dim1); +} +inline Tensor & Tensor::transpose_(int64_t dim0, int64_t dim1) { + return type().transpose_(*this, dim0, dim1); +} +inline Tensor Tensor::flip(IntList dims) const { + return type().flip(*this, dims); +} +inline Tensor Tensor::rot90(int64_t k, IntList dims) const { + return type().rot90(*this, k, dims); +} +inline Tensor Tensor::trunc() const { + return type().trunc(*this); +} +inline Tensor & Tensor::trunc_() { + return type().trunc_(*this); +} +inline Tensor Tensor::type_as(const Tensor & other) const { + return type().type_as(*this, other); +} +inline Tensor Tensor::unsqueeze(int64_t dim) const { + return type().unsqueeze(*this, dim); +} +inline Tensor & Tensor::unsqueeze_(int64_t dim) { + return type().unsqueeze_(*this, dim); +} +inline Tensor Tensor::var(bool unbiased) const { + return type().var(*this, unbiased); +} +inline Tensor Tensor::var(int64_t dim, bool unbiased, bool keepdim) const { + return type().var(*this, dim, unbiased, keepdim); +} +inline Tensor Tensor::view_as(const Tensor & other) const { + return type().view_as(*this, other); +} +inline Tensor Tensor::where(const Tensor & condition, const Tensor & other) const { + return type().where(condition, *this, other); +} +inline Tensor Tensor::norm(Scalar p) const { + return type().norm(*this, p); +} +inline Tensor Tensor::norm(Scalar p, int64_t dim, bool keepdim) const { + return type().norm(*this, p, dim, keepdim); +} +inline Tensor Tensor::clone() const { + return type().clone(*this); +} +inline Tensor & Tensor::resize_as_(const Tensor & the_template) { + return type().resize_as_(*this, the_template); +} +inline Tensor Tensor::pow(Scalar exponent) const { + return type().pow(*this, exponent); +} +inline Tensor & Tensor::zero_() { + return type().zero_(*this); +} +inline Tensor Tensor::sub(const Tensor & other, Scalar alpha) const { + return type().sub(*this, other, alpha); +} +inline Tensor & Tensor::sub_(const Tensor & other, Scalar alpha) { + return type().sub_(*this, other, alpha); +} +inline Tensor Tensor::sub(Scalar other, Scalar alpha) const { + return type().sub(*this, other, alpha); +} +inline Tensor & Tensor::sub_(Scalar other, Scalar alpha) { + return type().sub_(*this, other, alpha); +} +inline Tensor Tensor::addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { + return type().addmm(*this, mat1, mat2, beta, alpha); +} +inline Tensor & Tensor::addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return type().addmm_(*this, mat1, mat2, beta, alpha); +} 
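Every inline body in this header follows the same shape: the `Tensor` method is a thin wrapper that forwards to the virtual method on its `Type`, which is where per-backend dispatch happens. A small sketch (the function name `mm_two_ways` is illustrative, not part of this patch) showing that the wrapper and the explicit `Type` call are the same operation:

#include <ATen/ATen.h>

// Both calls below reach the same backend implementation; the first is the
// inline wrapper defined in this header, the second is what it forwards to.
at::Tensor mm_two_ways(const at::Tensor& self, const at::Tensor& mat2) {
  at::Tensor a = self.mm(mat2);               // Tensor::mm -> type().mm(...)
  at::Tensor b = self.type().mm(self, mat2);  // explicit dynamic dispatch via Type
  return a.sub(b);                            // expected to be all zeros
}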
+inline Tensor & Tensor::sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims) { + return type().sparse_resize_(*this, size, sparseDims, denseDims); +} +inline Tensor & Tensor::sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims) { + return type().sparse_resize_and_clear_(*this, size, sparseDims, denseDims); +} +inline Tensor Tensor::sparse_mask(SparseTensorRef mask) const { + return type().sparse_mask(*this, mask); +} +inline Tensor Tensor::to_dense() const { + return type().to_dense(*this); +} +inline int64_t Tensor::_sparseDims() const { + return type()._sparseDims(*this); +} +inline int64_t Tensor::_denseDims() const { + return type()._denseDims(*this); +} +inline int64_t Tensor::_nnz() const { + return type()._nnz(*this); +} +inline Tensor Tensor::coalesce() const { + return type().coalesce(*this); +} +inline bool Tensor::is_coalesced() const { + return type().is_coalesced(*this); +} +inline Tensor Tensor::_indices() const { + return type()._indices(*this); +} +inline Tensor Tensor::_values() const { + return type()._values(*this); +} +inline int64_t Tensor::numel() const { + return type().numel(*this); +} +inline std::vector Tensor::unbind(int64_t dim) const { + return type().unbind(*this, dim); +} +inline int64_t Tensor::get_device() const { + return type().get_device(*this); +} +inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) const { + return type().to(*this, device, dtype, non_blocking); +} +inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { + return type().to(*this, dtype, non_blocking); +} +inline Tensor Tensor::to(Device device, bool non_blocking) const { + return type().to(*this, device, non_blocking); +} +inline Tensor Tensor::to(const Tensor & other, bool non_blocking) const { + return type().to(*this, other, non_blocking); +} +inline Scalar Tensor::_local_scalar() const { + return type()._local_scalar(*this); +} + +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? 
get_device() : -1); +} + +#define DEFINE_CAST(T, name, _) \ + template <> \ + inline T* Tensor::data() const { \ + AT_CHECK( \ + type().scalarType() == ScalarType::name, \ + "expected scalar type ", \ + #name, \ + " but found ", \ + at::toString(type().scalarType())); \ + return static_cast(this->data_ptr()); \ + } \ + inline T* Tensor::to##name##Data() const { \ + return data(); \ + } + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) +#undef DEFINE_CAST + +#define DEFINE_TO_C_TYPE(T,name,_) \ +inline T Tensor::toC##name () const { return _local_scalar().to##name (); } + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) +#undef DEFINE_TO_C_TYPE + +} //namespace at diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h new file mode 100644 index 00000000000000..1366f899c30b84 --- /dev/null +++ b/aten/src/ATen/core/Type.h @@ -0,0 +1,634 @@ +#pragma once + +#include "ATen/core/ATenGeneral.h" +#include "ATen/core/Allocator.h" +#include "ATen/core/Deprecated.h" +#include "ATen/core/Generator.h" +#include "ATen/core/Layout.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/ArrayRef.h" +#include "ATen/core/Half.h" +#include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/Reduction.h" +#include "ATen/core/TensorOptions.h" + +#include +#include +#include +#include +#include + +// To solve the conflict of s_addr in inaddr.h +#ifdef _MSC_VER +#ifdef s_addr +#undef s_addr +#endif +#endif + +namespace at { + +class Context; +struct Allocator; +struct Generator; +struct Storage; +struct Tensor; + +static inline void noop_deleter(void*) {} + +enum class TypeID { + CPUByte, + CPUChar, + CPUDouble, + CPUFloat, + CPUInt, + CPULong, + CPUShort, + CPUHalf, + SparseCPUByte, + SparseCPUChar, + SparseCPUDouble, + SparseCPUFloat, + SparseCPUInt, + SparseCPULong, + SparseCPUShort, + CUDAByte, + CUDAChar, + CUDADouble, + CUDAFloat, + CUDAInt, + CUDALong, + CUDAShort, + CUDAHalf, + SparseCUDAByte, + SparseCUDAChar, + SparseCUDADouble, + SparseCUDAFloat, + SparseCUDAInt, + SparseCUDALong, + SparseCUDAShort, + CPUComplexFloat, + CPUComplexDouble, + CUDAComplexFloat, + CUDAComplexDouble, + Undefined, + NumOptions +}; + +struct AT_API Type { + explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) + : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} + + virtual ~Type() {} + virtual ScalarType scalarType() const = 0; + virtual caffe2::TypeMeta typeMeta() const = 0; + virtual Backend backend() const = 0; + Layout layout() const noexcept { return layout_from_backend(backend()); } + virtual bool is_cuda() const = 0; + virtual bool is_sparse() const = 0; + virtual bool is_distributed() const = 0; + bool is_variable() const noexcept { return is_variable_; } + bool is_undefined() const noexcept { return is_undefined_; } + virtual Allocator * allocator() const = 0; + virtual Device getDeviceFromPtr(void * data) const = 0; + virtual Storage storage(bool resizable = false) const = 0; + virtual Storage storage(size_t size, bool resizable = false) const = 0; + virtual Storage storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; + virtual Storage storageWithAllocator(int64_t size, Allocator* allocator) const = 0; + virtual std::unique_ptr generator() const = 0; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const = 0; + virtual Storage unsafeStorageFromTH(void * th_pointer, 
+  virtual const char * toString() const = 0;
+  virtual size_t elementSizeInBytes() const = 0;
+  virtual Type & toBackend(Backend b) const = 0;
+  virtual Type & toScalarType(ScalarType s) const = 0;
+  Type & toSparse() const {
+    return this->toBackend(at::toSparse(this->backend()));
+  }
+  Type & toDense() const {
+    return this->toBackend(at::toDense(this->backend()));
+  }
+  Type & cpu() const {
+    return this->toBackend(at::backendToCPU(this->backend()));
+  }
+  Type & cuda() const {
+    return this->toBackend(at::backendToCUDA(this->backend()));
+  }
+  // contiguous IDs for all types in the system
+  // for external dispatch
+  virtual TypeID ID() const = 0;
+
+  // New-style TensorTypeId that supports open registration.
+  TensorTypeId type_id() const { return type_id_; }
+
+  // NB: This will return DeviceType::CPU for Backend::SparseCPU
+  DeviceType device_type() const {
+    return backendToDeviceType(backend());
+  }
+
+  virtual Tensor copy(const Tensor & src, bool non_blocking=false, optional<Device> to_device={}) const = 0;
+  virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0;
+  virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0;
+  virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0;
+
+  virtual void backward(Tensor & self, at::optional<Tensor> gradient, bool keep_graph, bool create_graph) const = 0;
+  virtual void set_data(Tensor & self, Tensor new_data) const = 0;
+
+  virtual Tensor tensorFromBlob(void * data, IntList sizes, const std::function<void(void*)> & deleter=noop_deleter) const = 0;
+  virtual Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function<void(void*)> & deleter=noop_deleter) const = 0;
+  virtual Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const = 0;
+  virtual Tensor tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const = 0;
+  virtual Tensor scalarTensor(Scalar s) const = 0;
+
+  bool operator==(const Type& other) const {
+    return this == &other;
+  }
+  bool operator!=(const Type& other) const {
+    return this != &other;
+  }
+
+  /// Constructs the `TensorOptions` from a type and a `device_index`.
+ TensorOptions options(int32_t device_index = -1) const { + TensorOptions r; + r.dtype(scalarType()); + r.device({backendToDeviceType(backend()), device_index}); + r.layout(layout()); + r.is_variable(is_variable()); + return r; + } + + operator TensorOptions() const { + return options(); + } + + // example + // virtual Tensor * add(Tensor & a, Tensor & b) = 0; + virtual int64_t storage_offset(const Tensor & self) const = 0; + virtual Tensor & resize_(Tensor & self, IntList size) const = 0; + virtual Tensor & set_(Tensor & self, Storage source) const = 0; + virtual Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride) const = 0; + virtual Tensor & set_(Tensor & self, const Tensor & source) const = 0; + virtual Tensor & set_(Tensor & self) const = 0; + virtual bool is_contiguous(const Tensor & self) const = 0; + virtual bool is_set_to(const Tensor & self, const Tensor & tensor) const = 0; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const = 0; + virtual Tensor & masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const = 0; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const = 0; + virtual Tensor & masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const = 0; + virtual Tensor & s_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const = 0; + virtual Tensor & masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const = 0; + virtual Tensor s_masked_select(const Tensor & self, const Tensor & mask) const = 0; + virtual Tensor masked_select(const Tensor & self, const Tensor & mask) const = 0; + virtual Tensor nonzero(const Tensor & self) const = 0; + virtual Tensor contiguous(const Tensor & self) const = 0; + virtual Tensor view(const Tensor & self, IntList size) const = 0; + virtual Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) const = 0; + virtual Tensor take(const Tensor & self, const Tensor & index) const = 0; + virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) const = 0; + virtual Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value) const = 0; + virtual Tensor unfold(const Tensor & self, int64_t dimension, int64_t size, int64_t step) const = 0; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const = 0; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; + virtual Tensor & scatter_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const = 0; + virtual Tensor gather(const Tensor & self, int64_t dim, const Tensor & index) const = 0; + virtual void* data_ptr(const Tensor & self) const = 0; + virtual bool equal(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __and__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___and__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __and__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __iand__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___iand__(Tensor & self, const Tensor & other) const = 0; + 
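The `options()` helper and the implicit `operator TensorOptions()` above are how a `Type` hands its dtype, device type and layout over to the `TensorOptions`-based factory functions. A minimal usage sketch, not part of the patch; the `at::empty(IntList, TensorOptions)` overload and the `<ATen/ATen.h>` umbrella header are assumed:

    #include <ATen/ATen.h>

    // Allocate a fresh 2x3 tensor whose scalar type, device type and layout
    // match those of an existing tensor, by going through Type::options().
    at::Tensor empty_like_2x3(const at::Tensor& t) {
      at::TensorOptions opts = t.type().options();  // dtype/device/layout copied from t's Type
      return at::empty({2, 3}, opts);               // factory consumes the options
    }
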
virtual Tensor & __iand__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __or__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___or__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __or__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ior__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ior__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ior__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __xor__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___xor__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __xor__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ixor__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ixor__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ixor__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __lshift__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___lshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __lshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ilshift__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ilshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ilshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __rshift__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___rshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __rshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __irshift__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___irshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __irshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor lt(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_lt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor lt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & lt_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_lt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & lt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor gt(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_gt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor gt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & gt_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_gt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & gt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor le(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_le(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor le(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & le_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_le_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & le_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor ge(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_ge(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor ge(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ge_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_ge_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor 
& ge_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor eq(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_eq(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor eq(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & eq_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_eq_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & eq_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor ne(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_ne(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor ne(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ne_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_ne_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ne_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor s_min(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor min(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor min(const Tensor & self) const = 0; + virtual Tensor s_max(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor max(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor max(const Tensor & self) const = 0; + virtual Tensor median(const Tensor & self) const = 0; + virtual std::tuple sort(const Tensor & self, int64_t dim, bool descending) const = 0; + virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) const = 0; + virtual Tensor all(const Tensor & self) const = 0; + virtual Tensor any(const Tensor & self) const = 0; + virtual Tensor lgamma(const Tensor & self) const = 0; + virtual Tensor & lgamma_(Tensor & self) const = 0; + virtual Tensor digamma(const Tensor & self) const = 0; + virtual Tensor & digamma_(Tensor & self) const = 0; + virtual Tensor polygamma(int64_t n, const Tensor & self) const = 0; + virtual Tensor & polygamma_(Tensor & self, int64_t n) const = 0; + virtual Tensor & erfinv_(Tensor & self) const = 0; + virtual Tensor erfinv(const Tensor & self) const = 0; + virtual Tensor & frac_(Tensor & self) const = 0; + virtual Tensor frac(const Tensor & self) const = 0; + virtual Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; + virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; + virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p) const = 0; + virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p) const = 0; + virtual Tensor reciprocal(const Tensor & self) const = 0; + virtual Tensor & reciprocal_(Tensor & self) const = 0; + virtual Tensor neg(const Tensor & self) const = 0; + virtual Tensor & neg_(Tensor & self) const = 0; + virtual Tensor s_atan2(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor atan2(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & s_atan2_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & atan2_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor s_pow(const Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor pow(const Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor pow(Scalar base, const Tensor & self) const = 0; + virtual Tensor & pow_(Tensor & self, Scalar exponent) const = 0; + virtual Tensor & s_pow_(Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor & pow_(Tensor & self, const Tensor & 
exponent) const = 0; + virtual Tensor s_lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor & s_lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) const = 0; + virtual Tensor sign(const Tensor & self) const = 0; + virtual Tensor & sign_(Tensor & self) const = 0; + virtual Tensor trace(const Tensor & self) const = 0; + virtual Tensor fmod(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_fmod(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor fmod(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & fmod_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_fmod_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & fmod_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor remainder(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_remainder(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor remainder(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & remainder_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_remainder_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & remainder_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor tril(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor & tril_(Tensor & self, int64_t diagonal) const = 0; + virtual Tensor triu(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor & triu_(Tensor & self, int64_t diagonal) const = 0; + virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) const = 0; + virtual Tensor diag(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual std::tuple gels(const Tensor & self, const Tensor & A) const = 0; + virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular) const = 0; + virtual std::tuple symeig(const Tensor & 
self, bool eigenvectors, bool upper) const = 0; + virtual std::tuple eig(const Tensor & self, bool eigenvectors) const = 0; + virtual std::tuple svd(const Tensor & self, bool some) const = 0; + virtual Tensor potrf(const Tensor & self, bool upper) const = 0; + virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper) const = 0; + virtual Tensor potri(const Tensor & self, bool upper) const = 0; + virtual std::tuple pstrf(const Tensor & self, bool upper, Scalar tol) const = 0; + virtual std::tuple qr(const Tensor & self) const = 0; + virtual std::tuple geqrf(const Tensor & self) const = 0; + virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const = 0; + virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) const = 0; + virtual std::tuple btrifact(const Tensor & self, bool pivot) const = 0; + virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot) const = 0; + virtual Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const = 0; + virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator) const = 0; + virtual Tensor & random_(Tensor & self, int64_t to, Generator * generator) const = 0; + virtual Tensor & random_(Tensor & self, Generator * generator) const = 0; + virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement, Generator * generator) const = 0; + virtual Tensor & uniform_(Tensor & self, double from, double to, Generator * generator) const = 0; + virtual Tensor & normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; + virtual Tensor & cauchy_(Tensor & self, double median, double sigma, Generator * generator) const = 0; + virtual Tensor & log_normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; + virtual Tensor & exponential_(Tensor & self, double lambd, Generator * generator) const = 0; + virtual Tensor & geometric_(Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride) const = 0; + virtual Tensor tensor(IntList size, IntList stride) const = 0; + virtual Tensor abs(const Tensor & self) const = 0; + virtual Tensor & abs_(Tensor & self) const = 0; + virtual Tensor acos(const Tensor & self) const = 0; + virtual Tensor & acos_(Tensor & self) const = 0; + virtual Tensor add(const Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor & add_(Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const = 0; + virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; + virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim) const = 0; + 
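Each of these operations is a pure virtual on `Type`, and the inline `Tensor` methods earlier in the patch are one-line forwarders onto them, so dispatch to the CPU, CUDA or sparse implementation happens entirely through this vtable. A hedged sketch of what that looks like when written out by hand; the helper below is hypothetical and is equivalent to calling the corresponding `Tensor` members directly:

    #include <ATen/ATen.h>

    // Explicit dispatch through the Type vtable: clamp to [0, 1], then negate.
    at::Tensor clamp01_then_negate(const at::Tensor& t) {
      at::Tensor c = t.type().clamp(t, /*min=*/0, /*max=*/1);  // virtual Tensor clamp(const Tensor&, Scalar, Scalar)
      return c.type().neg(c);                                  // virtual Tensor neg(const Tensor&)
    }
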
AT_DEPRECATED(virtual Tensor arange(Scalar start, Scalar end, Scalar step) const = 0); + AT_DEPRECATED(virtual Tensor arange(Scalar end) const = 0); + virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor argmax(const Tensor & self) const = 0; + virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor argmin(const Tensor & self) const = 0; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride) const = 0; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride) const = 0; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0; + virtual Tensor asin(const Tensor & self) const = 0; + virtual Tensor & asin_(Tensor & self) const = 0; + virtual Tensor atan(const Tensor & self) const = 0; + virtual Tensor & atan_(Tensor & self) const = 0; + virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator) const = 0; + virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor bernoulli(const Tensor & self) const = 0; + virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator) const = 0; + virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor & bernoulli_(Tensor & self) const = 0; + virtual Tensor bincount(const Tensor & self, const Tensor & weights, int64_t minlength) const = 0; + virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const = 0; + virtual Tensor ceil(const Tensor & self) const = 0; + virtual Tensor & ceil_(Tensor & self) const = 0; + virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim) const = 0; + virtual Tensor clamp(const Tensor & self, Scalar min, Scalar max) const = 0; + virtual Tensor & clamp_(Tensor & self, Scalar min, Scalar max) const = 0; + virtual Tensor clamp_max(const Tensor & self, Scalar max) const = 0; + virtual Tensor & clamp_max_(Tensor & self, Scalar max) const = 0; + virtual Tensor clamp_min(const Tensor & self, Scalar min) const = 0; + virtual Tensor & clamp_min_(Tensor & self, Scalar min) const = 0; + virtual Tensor cos(const Tensor & self) const = 0; + virtual Tensor & cos_(Tensor & self) const = 0; + virtual Tensor cosh(const Tensor & self) const = 0; + virtual Tensor & cosh_(Tensor & self) const = 0; + virtual Tensor cumsum(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor cumsum(const Tensor & self, int64_t dim) const = 0; + virtual Tensor cumprod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor cumprod(const Tensor & self, int64_t dim) const = 0; + virtual Tensor det(const Tensor & self) const = 0; + virtual Tensor diagflat(const Tensor & self, int64_t offset) const = 0; + virtual Tensor diagonal(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2) const = 0; + virtual Tensor div(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & div_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor div(const Tensor & self, Scalar other) const = 
0; + virtual Tensor & div_(Tensor & self, Scalar other) const = 0; + virtual Tensor dot(const Tensor & self, const Tensor & tensor) const = 0; + AT_DEPRECATED(virtual Tensor empty(IntList size) const = 0); + virtual Tensor erf(const Tensor & self) const = 0; + virtual Tensor & erf_(Tensor & self) const = 0; + virtual Tensor erfc(const Tensor & self) const = 0; + virtual Tensor & erfc_(Tensor & self) const = 0; + virtual Tensor exp(const Tensor & self) const = 0; + virtual Tensor & exp_(Tensor & self) const = 0; + virtual Tensor expm1(const Tensor & self) const = 0; + virtual Tensor & expm1_(Tensor & self) const = 0; + virtual Tensor expand(const Tensor & self, IntList size, bool implicit) const = 0; + virtual Tensor expand_as(const Tensor & self, const Tensor & other) const = 0; + AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m) const = 0); + virtual Tensor flatten(const Tensor & self, int64_t start_dim, int64_t end_dim) const = 0; + virtual Tensor & fill_(Tensor & self, Scalar value) const = 0; + virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0; + virtual Tensor floor(const Tensor & self) const = 0; + virtual Tensor & floor_(Tensor & self) const = 0; + AT_DEPRECATED(virtual Tensor full(IntList size, Scalar fill_value) const = 0); + virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0; + virtual std::tuple gesv(const Tensor & self, const Tensor & A) const = 0; + virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; + virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; + virtual Tensor rfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool onesided) const = 0; + virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const = 0; + virtual Tensor index(const Tensor & self, TensorList indices) const = 0; + virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; + virtual Tensor index_put(const Tensor & self, TensorList indices, const Tensor & values) const = 0; + virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const = 0; + virtual Tensor inverse(const Tensor & self) const = 0; + virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; + virtual bool is_cuda(const Tensor & self) const = 0; + virtual bool is_distributed(const Tensor & self) const = 0; + virtual bool is_floating_point(const Tensor & self) const = 0; + virtual bool is_complex(const Tensor & self) const = 0; + virtual bool is_nonzero(const Tensor & self) const = 0; + virtual bool is_same_size(const Tensor & self, const Tensor & other) const = 0; + virtual bool is_signed(const Tensor & self) const = 0; + virtual bool is_sparse(const Tensor & self) const = 0; + virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim) const = 0; + AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, int64_t steps) const = 0); + virtual Tensor log(const Tensor & self) const = 0; + virtual Tensor & log_(Tensor & self) const = 0; + virtual Tensor log10(const Tensor & self) const = 0; + virtual Tensor & log10_(Tensor & self) const = 0; + virtual Tensor log1p(const Tensor & self) const = 0; + virtual Tensor & log1p_(Tensor & self) const = 0; + virtual Tensor log2(const Tensor & self) const = 0; + virtual Tensor & log2_(Tensor & self) const = 0; + virtual Tensor 
logdet(const Tensor & self) const = 0; + AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps) const = 0); + virtual Tensor log_softmax(const Tensor & self, int64_t dim) const = 0; + virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor matmul(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor matrix_power(const Tensor & self, int64_t n) const = 0; + virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor max_values(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor mean(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor mean(const Tensor & self) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual std::tuple min(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor min_values(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor mm(const Tensor & self, const Tensor & mat2) const = 0; + virtual std::tuple mode(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor mul(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & mul_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor mul(const Tensor & self, Scalar other) const = 0; + virtual Tensor & mul_(Tensor & self, Scalar other) const = 0; + virtual Tensor mv(const Tensor & self, const Tensor & vec) const = 0; + virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0; + virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0; + virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; + AT_DEPRECATED(virtual Tensor ones(IntList size) const = 0); + virtual Tensor permute(const Tensor & self, IntList dims) const = 0; + virtual Tensor pin_memory(const Tensor & self) const = 0; + virtual Tensor pinverse(const Tensor & self, double rcond) const = 0; + AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step) const = 0); + virtual Tensor repeat(const Tensor & self, IntList repeats) const = 0; + virtual Tensor reshape(const Tensor & self, IntList shape) const = 0; + virtual Tensor reshape_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor round(const Tensor & self) const = 0; + virtual Tensor & round_(Tensor & self) const = 0; + virtual Tensor relu(const Tensor & self) const = 0; + virtual Tensor & relu_(Tensor & self) const = 0; + virtual Tensor hardshrink(const Tensor & self, Scalar lambd) const = 0; + virtual Tensor hardshrink_backward(const Tensor & grad_out, const Tensor & self, Scalar lambd) const = 0; + virtual Tensor rsqrt(const Tensor & self) const = 0; + virtual Tensor & rsqrt_(Tensor & self) 
const = 0; + virtual Tensor select(const Tensor & self, int64_t dim, int64_t index) const = 0; + virtual Tensor sigmoid(const Tensor & self) const = 0; + virtual Tensor & sigmoid_(Tensor & self) const = 0; + virtual Tensor sin(const Tensor & self) const = 0; + virtual Tensor & sin_(Tensor & self) const = 0; + virtual Tensor sinh(const Tensor & self) const = 0; + virtual Tensor & sinh_(Tensor & self) const = 0; + virtual Tensor detach(const Tensor & self) const = 0; + virtual Tensor & detach_(Tensor & self) const = 0; + virtual int64_t size(const Tensor & self, int64_t dim) const = 0; + virtual Tensor slice(const Tensor & self, int64_t dim, int64_t start, int64_t end, int64_t step) const = 0; + virtual std::tuple slogdet(const Tensor & self) const = 0; + virtual Tensor smm(const Tensor & self, const Tensor & mat2) const = 0; + virtual Tensor softmax(const Tensor & self, int64_t dim) const = 0; + virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim) const = 0; + virtual std::vector split_with_sizes(const Tensor & self, IntList split_sizes, int64_t dim) const = 0; + virtual Tensor squeeze(const Tensor & self) const = 0; + virtual Tensor squeeze(const Tensor & self, int64_t dim) const = 0; + virtual Tensor & squeeze_(Tensor & self) const = 0; + virtual Tensor & squeeze_(Tensor & self, int64_t dim) const = 0; + virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const = 0; + virtual int64_t stride(const Tensor & self, int64_t dim) const = 0; + virtual Tensor sum(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor sum(const Tensor & self) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, ScalarType dtype) const = 0; + virtual Tensor sqrt(const Tensor & self) const = 0; + virtual Tensor & sqrt_(Tensor & self) const = 0; + virtual Tensor std(const Tensor & self, bool unbiased) const = 0; + virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0; + virtual Tensor prod(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor prod(const Tensor & self) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor t(const Tensor & self) const = 0; + virtual Tensor & t_(Tensor & self) const = 0; + virtual Tensor tan(const Tensor & self) const = 0; + virtual Tensor & tan_(Tensor & self) const = 0; + virtual Tensor tanh(const Tensor & self) const = 0; + virtual Tensor & tanh_(Tensor & self) const = 0; + virtual Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) const = 0; + virtual Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) const = 0; + virtual Tensor flip(const Tensor & self, IntList dims) const = 0; + virtual Tensor rot90(const Tensor & self, int64_t k, IntList dims) const = 0; + virtual Tensor trunc(const Tensor & self) const = 0; + virtual Tensor & trunc_(Tensor & self) const = 0; + virtual Tensor type_as(const Tensor & self, const Tensor & 
other) const = 0; + virtual Tensor unsqueeze(const Tensor & self, int64_t dim) const = 0; + virtual Tensor & unsqueeze_(Tensor & self, int64_t dim) const = 0; + virtual Tensor var(const Tensor & self, bool unbiased) const = 0; + virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0; + virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0; + AT_DEPRECATED(virtual Tensor zeros(IntList size) const = 0); + virtual Tensor norm(const Tensor & self, Scalar p) const = 0; + virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) const = 0; + virtual Tensor clone(const Tensor & self) const = 0; + virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const = 0; + virtual Tensor pow(const Tensor & self, Scalar exponent) const = 0; + virtual Tensor & zero_(Tensor & self) const = 0; + virtual Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor tensor() const = 0; + virtual Tensor tensor(IntList size) const = 0; + virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; + virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; + virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor sparse_coo_tensor(IntList size) const = 0; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor _native_sparse_coo_tensor_unsafe(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor sparse_mask(const Tensor & self, SparseTensorRef mask) const = 0; + virtual Tensor to_dense(const Tensor & self) const = 0; + virtual int64_t _sparseDims(const Tensor & self) const = 0; + virtual int64_t _denseDims(const Tensor & self) const = 0; + virtual int64_t _nnz(const Tensor & self) const = 0; + virtual Tensor coalesce(const Tensor & self) const = 0; + virtual bool is_coalesced(const Tensor & self) const = 0; + virtual Tensor _indices(const Tensor & self) const = 0; + virtual Tensor _values(const Tensor & self) const = 0; + virtual int64_t numel(const Tensor & self) const = 0; + virtual std::vector unbind(const Tensor & self, int64_t dim) const = 0; + virtual int64_t get_device(const Tensor & self) const = 0; + virtual Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, Device device, bool 
non_blocking) const = 0; + virtual Tensor to(const Tensor & self, const Tensor & other, bool non_blocking) const = 0; + virtual Scalar _local_scalar(const Tensor & self) const = 0; +protected: + TensorTypeId type_id_; + bool is_variable_; + bool is_undefined_; + +}; + +} // namespace at + +#include "ATen/core/Tensor.h" diff --git a/aten/src/ATen/core/UndefinedTensorImpl.cpp b/aten/src/ATen/core/UndefinedTensorImpl.cpp new file mode 100644 index 00000000000000..e26b61a03c87e0 --- /dev/null +++ b/aten/src/ATen/core/UndefinedTensorImpl.cpp @@ -0,0 +1,40 @@ +#include "ATen/core/UndefinedTensorImpl.h" +#include "ATen/core/Error.h" + +namespace at { + +// should this use the globalContext? Can it get a context passed in somehow? +UndefinedTensorImpl::UndefinedTensorImpl() +: TensorImpl(UndefinedTensorId(), caffe2::TypeMeta(), nullptr, /* is variable */ false) { +} + +IntList UndefinedTensorImpl::sizes() const { + AT_ERROR("sizes() called on undefined Tensor"); +} + +int64_t UndefinedTensorImpl::size(int64_t d) const { + AT_ERROR("size(dim) called on an undefined Tensor"); +} + +int64_t UndefinedTensorImpl::stride(int64_t d) const { + AT_ERROR("stride(dim) called on an undefined Tensor"); +} + +int64_t UndefinedTensorImpl::dim() const { + AT_ERROR("dim() called on undefined Tensor"); +} + +const Storage& UndefinedTensorImpl::storage() const { + AT_ERROR("storage() called on undefined Tensor"); +} + +int64_t UndefinedTensorImpl::storage_offset() const { + AT_ERROR("storage_offset() called on an undefined Tensor"); +} + +IntList UndefinedTensorImpl::strides() const { + AT_ERROR("strides() called on undefined Tensor"); +} +UndefinedTensorImpl UndefinedTensorImpl::_singleton; + +} diff --git a/aten/src/ATen/UndefinedTensor.h b/aten/src/ATen/core/UndefinedTensorImpl.h similarity index 72% rename from aten/src/ATen/UndefinedTensor.h rename to aten/src/ATen/core/UndefinedTensorImpl.h index 86faf028802c1e..6c734950d90cad 100644 --- a/aten/src/ATen/UndefinedTensor.h +++ b/aten/src/ATen/core/UndefinedTensorImpl.h @@ -1,13 +1,13 @@ #pragma once -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" namespace at { -struct AT_API UndefinedTensor final : public TensorImpl { +struct AT_API UndefinedTensorImpl final : public TensorImpl { public: // Without this, we get: - // error: identifier "at::UndefinedTensor::_singleton" is undefined in device code + // error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in device code // (ostensibly because the constexpr tricks MSVC into trying to compile this // function for device as well). 
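`UndefinedTensorImpl` (renamed here from `UndefinedTensor`) is the shared singleton behind tensors that do not hold a value; as the .cpp above shows, every shape and storage accessor raises `AT_ERROR` instead of returning garbage. A hedged illustration, assuming (as elsewhere in ATen) that a default-constructed `at::Tensor` wraps this singleton and that `AT_ERROR` surfaces as a `std::exception`:

    #include <ATen/ATen.h>
    #include <exception>

    void probe_undefined() {
      at::Tensor t;                 // default-constructed: backed by UndefinedTensorImpl
      bool defined = t.defined();   // false
      try {
        t.sizes();                  // "sizes() called on undefined Tensor"
      } catch (const std::exception&) {
        // expected: querying an undefined tensor throws
      }
      (void)defined;
    }
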
#ifdef _WIN32 @@ -25,8 +25,8 @@ struct AT_API UndefinedTensor final : public TensorImpl { const Storage& storage() const override; int64_t storage_offset() const override; private: - UndefinedTensor(); - static UndefinedTensor _singleton; + UndefinedTensorImpl(); + static UndefinedTensorImpl _singleton; public: friend struct UndefinedType; }; diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 2ca9a7f6851102..7cf1b7cc174980 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -61,7 +61,7 @@ class AT_CORE_API BaseContext { virtual BaseStaticContext* GetStaticContext() const = 0; /* Sorry for the naming, will get rid of this in future diff */ - virtual DeviceType GetDevicetype() const = 0; + virtual DeviceType device_type() const = 0; virtual void SwitchToDevice(int /*stream_id*/) = 0; @@ -96,13 +96,13 @@ class AT_CORE_API BaseContext { DeviceType type) { if (type == DeviceType::CPU) { CopyBytesToCPU(nbytes, src, dst); - } else if (type == GetDevicetype()) { + } else if (type == device_type()) { CopyBytesSameDevice(nbytes, src, dst); } else { AT_ERROR( "CopyBytesToDevice can only copy to CPU or between same " "device. Can't copy from: ", - GetDevicetype(), + device_type(), " to", type); } diff --git a/aten/src/ATen/core/intrusive_ptr.h b/aten/src/ATen/core/intrusive_ptr.h index 65c6b5e702f2a0..961915555a3756 100644 --- a/aten/src/ATen/core/intrusive_ptr.h +++ b/aten/src/ATen/core/intrusive_ptr.h @@ -119,6 +119,15 @@ struct AT_CORE_EXPORT intrusive_target_default_null_type final { return nullptr; } }; + +template +TTarget* assign_ptr_(TTarget* rhs) { + if (FromNullType::singleton() == rhs) { + return ToNullType::singleton(); + } else { + return rhs; + } +} } // namespace detail template @@ -191,17 +200,10 @@ class AT_CORE_EXPORT intrusive_ptr final { template /* implicit */ intrusive_ptr(intrusive_ptr&& rhs) noexcept - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr move constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr move constructor got pointer with differing null value."); -#endif rhs.target_ = FromNullType::singleton(); } @@ -212,17 +214,10 @@ class AT_CORE_EXPORT intrusive_ptr final { template /* implicit */ intrusive_ptr( const intrusive_ptr& rhs) - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr copy constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr copy constructor got pointer with differing null value."); -#endif retain_(); } @@ -240,13 +235,6 @@ class AT_CORE_EXPORT intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr move assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. 
intrusive_ptr move assignment got pointer with differing null value."); -#endif intrusive_ptr tmp = std::move(rhs); swap(tmp); return *this; @@ -261,13 +249,6 @@ class AT_CORE_EXPORT intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr copy assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr copy assignment got pointer with differing null value."); -#endif intrusive_ptr tmp = rhs; swap(tmp); return *this; @@ -464,17 +445,10 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { template /* implicit */ weak_intrusive_ptr( weak_intrusive_ptr&& rhs) noexcept - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr move constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr move constructor got pointer with differing null value."); -#endif rhs.target_ = FromNullType::singleton(); } @@ -486,17 +460,10 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { template /* implicit */ weak_intrusive_ptr( const weak_intrusive_ptr& rhs) - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr copy constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr copy constructor got pointer with differing null value."); -#endif retain_(); } @@ -515,13 +482,6 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr move assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr move assignment got pointer with differing null value."); -#endif weak_intrusive_ptr tmp = std::move(rhs); swap(tmp); return *this; @@ -537,13 +497,6 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr copy assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. 
weak_intrusive_ptr copy assignment got pointer with differing null value."); -#endif weak_intrusive_ptr tmp = rhs; swap(tmp); return *this; diff --git a/aten/src/ATen/core/intrusive_ptr_test.cpp b/aten/src/ATen/core/intrusive_ptr_test.cpp index df98563f1ff1fc..4d0701ebe164d7 100644 --- a/aten/src/ATen/core/intrusive_ptr_test.cpp +++ b/aten/src/ATen/core/intrusive_ptr_test.cpp @@ -59,6 +59,23 @@ class ChildDestructableMock final : public DestructableMock { ChildDestructableMock(bool* resourcesReleased, bool* wasDestructed) : DestructableMock(resourcesReleased, wasDestructed) {} }; +class NullType1 final { + static SomeClass singleton_; +public: + static constexpr SomeClass* singleton() { + return &singleton_; + } +}; +SomeClass NullType1::singleton_; +class NullType2 final { + static SomeClass singleton_; +public: + static constexpr SomeClass* singleton() { + return &singleton_; + } +}; +SomeClass NullType2::singleton_; +static_assert(NullType1::singleton() != NullType2::singleton(), ""); } // namespace static_assert( @@ -262,6 +279,19 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenMoveAssigningToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2; + obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenValidPtr_whenCopyAssigning_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); intrusive_ptr obj2 = make_intrusive(); @@ -359,6 +389,19 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenCopyAssigningToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2; + obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenPtr_whenMoveConstructing_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); SomeClass* obj1ptr = obj1.get(); @@ -420,6 +463,18 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenMoveConstructingToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenPtr_whenCopyConstructing_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); SomeClass* obj1ptr = obj1.get(); @@ -482,6 +537,18 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenCopyConstructingToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, SwapFunction) { intrusive_ptr obj1 = make_intrusive(); intrusive_ptr obj2 = make_intrusive(); @@ -1520,9 +1587,9 @@ weak_intrusive_ptr make_weak_only(Args&&... 
args) { auto intrusive = make_intrusive(std::forward(args)...); return weak_intrusive_ptr(intrusive); } -template -weak_intrusive_ptr make_invalid_weak() { - return weak_intrusive_ptr(intrusive_ptr()); +template > +weak_intrusive_ptr make_invalid_weak() { + return weak_intrusive_ptr(intrusive_ptr()); } } // namespace @@ -1752,6 +1819,17 @@ TEST( EXPECT_TRUE(obj2.weak.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenMoveAssigningToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = make_invalid_weak(); + obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenValidPtr_whenCopyAssigning_thenPointsToSameObject) { @@ -1930,6 +2008,17 @@ TEST( EXPECT_TRUE(obj2.weak.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenCopyAssigningToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = make_invalid_weak(); + obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenPtr_whenMoveConstructing_thenPointsToSameObject) { @@ -2014,6 +2103,16 @@ TEST( EXPECT_TRUE(obj2.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenMoveConstructingToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenPtr_whenCopyConstructing_thenPointsToSameObject) { @@ -2097,6 +2196,16 @@ TEST( EXPECT_TRUE(obj2.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenCopyConstructingToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST(WeakIntrusivePtrTest, SwapFunction) { IntrusiveAndWeak obj1 = make_weak_intrusive(); IntrusiveAndWeak obj2 = make_weak_intrusive(); diff --git a/torch/csrc/jit/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp similarity index 86% rename from torch/csrc/jit/ivalue.cpp rename to aten/src/ATen/core/ivalue.cpp index 315da36deb196f..3d2b56893e7188 100644 --- a/torch/csrc/jit/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,12 +1,15 @@ -#include "torch/csrc/jit/assertions.h" -#include "torch/csrc/jit/ivalue.h" -#include +#include +#include #define TORCH_FORALL_TAGS(_) \ _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) namespace torch { namespace jit { +AT_API c10::intrusive_ptr ConstantString::create(std::string str_) { + return c10::make_intrusive(std::move(str_)); +} + namespace { template diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h new file mode 100644 index 00000000000000..914598f6ceb426 --- /dev/null +++ b/aten/src/ATen/core/ivalue.h @@ -0,0 +1,425 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace torch { namespace jit { + +template +using Shared = c10::intrusive_ptr; + +// string +struct AT_API ConstantString final : c10::intrusive_ptr_target { + private: + const std::string str_; + public: + ConstantString(std::string str) + : str_(std::move(str)) 
{} + static c10::intrusive_ptr create(std::string str_); + const std::string & string() const { + return str_; + } + operator const std::string & () const { + return string(); + } + AT_API friend std::ostream& operator<<( + std::ostream& out, + const ConstantString& v); +}; + +// non-mutable list +template +struct AT_CORE_EXPORT ConstantList final : c10::intrusive_ptr_target { + private: + const std::vector elements_; + public: + ConstantList(std::vector elements_) + : elements_(std::move(elements_)) {} + static c10::intrusive_ptr> create(std::vector elements_) { + return c10::make_intrusive>(std::move(elements_)); + } + const std::vector& elements() const { + return elements_; + } + operator const std::vector&() const { + return elements(); + } +}; + +struct IValue; +using Tuple = ConstantList; +using IntList = ConstantList; +using TensorList = ConstantList; +using DoubleList = ConstantList; + +// IValue is the generic tagged union used by the interpreter to hold +// all value types. +// It is a 16-byte object with an 8-byte payload and an 8-byte tag. +// The tag is currently 4 bytes to determine the type, and 1 byte +// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs +// retain/release calls. + +#define TORCH_FORALL_TAGS(_) \ + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) + +struct AT_API IValue final { + IValue() + : payload{0} + , tag(Tag::None) + , is_intrusive_ptr(false) {} + IValue(const IValue& rhs) + : payload(rhs.payload), + tag(rhs.tag), + is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + } + } + IValue(IValue&& rhs) noexcept : IValue() { + swap(rhs); + } + ~IValue() { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); + } + } + IValue & operator=(IValue && rhs) & noexcept { + IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + return *this; + } + IValue & operator=(IValue const & rhs) & { + IValue(rhs).swap(*this); + return *this; + } + void swap(IValue & rhs) noexcept { + std::swap(payload, rhs.payload); + std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); + std::swap(tag, rhs.tag); + } + // Accessors for subtypes are arranged together below + // While some of these accessors could be generated through templates, + // we prefer to write them manually for clarity + + // Tensor + IValue(at::Tensor t) + : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { + // Note: the undefined tensor is not refcounted, so while it + // is tagged as a tensor, is_intrusive_ptr is set to false. + // This is not an optional optimization: our incref call + // *will not* do the right thing when called on an + // undefined tensor. 
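  // Concretely (hedged editorial sketch, not part of the patch):
  //   at::Tensor undef;        // default-constructed, defined() == false
  //   IValue v(undef);         // tag == Tag::Tensor, but is_intrusive_ptr == false
  //   IValue w = v;            // copy ctor skips incref because is_intrusive_ptr is false
  //   // destroying v and w likewise skips decref, so the non-refcounted
  //   // singleton TensorImpl behind `undef` is never touched.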
+ payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + } + bool isTensor() const { return Tag::Tensor == tag; } + at::Tensor toTensor() && { + AT_ASSERT(isTensor()); + return at::Tensor(moveToIntrusivePtr()); + } + at::Tensor toTensor() const & { + AT_ASSERT(isTensor()); + return at::Tensor(toIntrusivePtr()); + } + + // Tuple + IValue(c10::intrusive_ptr v); + bool isTuple() const { return Tag::Tuple == tag; } + c10::intrusive_ptr toTuple() && { + AT_ASSERT(isTuple()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toTuple() const & { + AT_ASSERT(isTuple()); + return toIntrusivePtr(); + } + + // Double + IValue(double d) + : tag(Tag::Double), is_intrusive_ptr(false) { + payload.as_double = d; + } + bool isDouble() const { return Tag::Double == tag; } + double toDouble() const { + AT_ASSERT(isDouble()); + return payload.as_double; + } + + // Int + IValue(int64_t i) + : tag(Tag::Int), is_intrusive_ptr(false) { + payload.as_int = i; + } + + // allow you to pass literals (3, 4) without ambiguity + IValue(int32_t i) + : IValue(static_cast(i)) {} + IValue(bool b) + : IValue(static_cast(b)) {} + + bool isInt() const { return Tag::Int == tag; } + + int64_t toInt() const { + AT_ASSERT(isInt()); + return payload.as_int; + } + + // IntList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + IValue(at::ArrayRef v) + : IValue(v.vec()) {} + bool isIntList() const { return Tag::IntList == tag; } + c10::intrusive_ptr toIntList() && { + AT_ASSERT(isIntList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toIntList() const & { + AT_ASSERT(isIntList()); + return toIntrusivePtr(); + } + + const std::vector& toIntListRef() const; + const std::vector& toDoubleListRef() const; + const std::vector& toTensorListRef() const; + + // ConstantString + IValue(c10::intrusive_ptr v); + IValue(std::string v); + bool isString() const { return Tag::String == tag; } + c10::intrusive_ptr toString() && { + AT_ASSERT(isString()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toString() const & { + AT_ASSERT(isString()); + return toIntrusivePtr(); + } + + // DoubleList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isDoubleList() const { return Tag::DoubleList == tag; } + c10::intrusive_ptr toDoubleList() && { + AT_ASSERT(isDoubleList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toDoubleList() const & { + AT_ASSERT(isDoubleList()); + return toIntrusivePtr(); + } + + //TensorList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + c10::intrusive_ptr toTensorList() && { + AT_ASSERT(isTensorList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toTensorList() const & { + AT_ASSERT(isTensorList()); + return toIntrusivePtr(); + } + + // None + bool isNone() { + return Tag::None == tag; + } + std::string toNone() const { + return "None"; + } + // Scalar, which gets encoded as either an Int or a Double + IValue(at::Scalar s) + : IValue() { + if(s.isFloatingPoint()) { + *this = s.toDouble(); + } else { + *this = s.toLong(); + } + } + bool isScalar() { + return isDouble() || isInt(); + } + at::Scalar toScalar() const { + if(isDouble()) + return toDouble(); + else if(isInt()) + return toInt(); + else + throw std::runtime_error("IValue is not a Scalar"); + } + + // for debugging + std::string tagKind() const { + switch(tag) { + #define DEFINE_CASE(x) case Tag::x: return #x; + TORCH_FORALL_TAGS(DEFINE_CASE) + #undef DEFINE_CASE + } + return "Invalid Tag"; + } + + // generic v.to() 
implementations + // that can be used in special functions like pop/push + // that use template meta-programming. + // prefer the directly named methods when you can, + // since they are simpler to understand + + // Note: if you get linker errors saying one of these is missing, + // change it to ... && = delete; and you will see better error messages for why + // However, we cannot commit this because some compiler versions barf on it. + template + T to() &&; + template + T to() const &; + + AT_API friend std::ostream& operator<<(std::ostream& out, const IValue& v); + + private: + // NOTE: IValue tags are intentionally private. In the future we may encode + // this value different (e.g. using NaN boxing), and this would make it more + // costly to determine the tag for all types vs just determining if something + // is a particular type. Instead we want clients to use the `isX` methods when + // possible. If for perf. reasons you really, absolutely, must have a jump + // table, then we can revisit this. + enum class Tag : uint32_t { +#define DEFINE_TAG(x) x, + TORCH_FORALL_TAGS(DEFINE_TAG) +#undef DEFINE_TAG + }; + + template> + c10::intrusive_ptr moveToIntrusivePtr() { + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + clearToNone(); + return t; + } + template> + c10::intrusive_ptr toIntrusivePtr() const { + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + auto p = r; + r.release(); + return p; + } + void clearToNone() { + payload.as_int = 0; + tag = Tag::None; + is_intrusive_ptr = false; + } + union { + int64_t as_int; + double as_double; + c10::intrusive_ptr_target* as_intrusive_ptr; + } payload; + Tag tag; + bool is_intrusive_ptr; +}; + +#undef TORCH_FORALL_TAGS + + +#define DEFINE_TO(type, method_name) \ +template<> \ +inline type IValue::to() && { \ + return std::move(*this).method_name(); \ +} \ +template<> \ +inline type IValue::to() const & { \ + return this->method_name(); \ +} +DEFINE_TO(at::Tensor, toTensor) +DEFINE_TO(c10::intrusive_ptr, toTuple) +DEFINE_TO(double, toDouble) +DEFINE_TO(int64_t, toInt) +DEFINE_TO(c10::intrusive_ptr, toDoubleList) +DEFINE_TO(c10::intrusive_ptr, toIntList) +DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toString) +DEFINE_TO(at::Scalar, toScalar) +DEFINE_TO(bool, toInt) +DEFINE_TO(std::vector, toIntListRef) +DEFINE_TO(std::vector, toDoubleListRef) +DEFINE_TO(std::vector, toTensorListRef) + +#undef DEFINE_TO + +#define DEFINE_TO_WITH_BODY(type, body) \ +template<> \ +inline type IValue::to() && { \ + body(std::move(*this)); \ +} \ +template<> \ +inline type IValue::to() const & { \ + body((*this)); \ +} + +#define SCALAR_TYPE_BODY(this) return static_cast(this.toInt()); +#define LAYOUT_BODY(this) return static_cast(this.toInt()); +#define DEVICE_BODY(this) \ + /* NB: const_list might be a move of the vector, so we need to */ \ + /* assign it to prevent its deallocation. 
*/ \ + auto&& const_list = this.toIntList(); \ + const auto& elems = const_list->elements(); \ + AT_ASSERT(elems.size() == 2); \ + return at::Device(static_cast(elems[0]), elems[1]); + +DEFINE_TO_WITH_BODY(at::ScalarType, SCALAR_TYPE_BODY) +DEFINE_TO_WITH_BODY(at::Layout, LAYOUT_BODY) +DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) + +#undef DEFINE_TO_WITH_BODY +#undef SCALAR_TYPE_BODY +#undef LAYOUT_BODY +#undef DEVICE_BODY + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Tuple), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::IntList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(IntList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::String), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::string v) +: IValue(ConstantString::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::DoubleList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(DoubleList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::TensorList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(TensorList::create(std::move(v))) {} + +inline const std::vector& IValue::toIntListRef() const { + return toIntList()->elements(); +} + +inline const std::vector& IValue::toDoubleListRef() const { + return toDoubleList()->elements(); +} + +inline const std::vector& IValue::toTensorListRef() const { + return toTensorList()->elements(); +} + + +}} diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index e258d3be6a7fa3..a19035de1ba034 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -49,9 +49,9 @@ class AT_CORE_API TypeIdentifier final : public at::IdWrapper - bool Match() const { + bool Match() const noexcept { return (id_ == Id()); } @@ -452,10 +452,8 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { class Tensor; -// Note: we have preallocated the numbers 0-8 so they line up exactly +// Note: we have preallocated the numbers so they line up exactly // with at::ScalarType's numbering. All other numbers do not matter. -// -// Notably, the "uninitialized" type id is 8, not 0, for hysterical raisins. 
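(Aside, not part of the diff: a minimal usage sketch of the IValue tagged union added in ivalue.h above. The types and accessors come from that header; the concrete values, and the omitted includes/namespace qualification, are illustrative assumptions.)

  // Hypothetical client code for the IValue API introduced above.
  IValue iv_int(int64_t(42));                        // tagged Int, stored inline, no refcount
  IValue iv_str(std::string("hello"));               // wraps a refcounted ConstantString
  IValue iv_list(std::vector<int64_t>{1, 2, 3});     // wraps a refcounted IntList

  int64_t n = iv_int.toInt();                        // checked accessor (asserts the tag)
  const std::string& s = iv_str.toString()->string();
  const std::vector<int64_t>& xs = iv_list.toIntListRef();
  int64_t m = iv_int.to<int64_t>();                  // generic accessor used by push/pop-style helpers
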
struct _CaffeHighestPreallocatedTypeId final {}; @@ -470,7 +468,7 @@ CAFFE_DECLARE_KNOWN_TYPE(7, double) CAFFE_DECLARE_KNOWN_TYPE(8, at::ComplexHalf) CAFFE_DECLARE_KNOWN_TYPE(9, std::complex) CAFFE_DECLARE_KNOWN_TYPE(10, std::complex) -// 10 = undefined type id +// 11 = undefined type id CAFFE_DECLARE_KNOWN_TYPE(12, Tensor) CAFFE_DECLARE_KNOWN_TYPE(13, std::string) diff --git a/aten/src/ATen/cpu/vec256/intrinsics.h b/aten/src/ATen/cpu/vec256/intrinsics.h index 442e8fd0511fc7..76779aada7a0b3 100644 --- a/aten/src/ATen/cpu/vec256/intrinsics.h +++ b/aten/src/ATen/cpu/vec256/intrinsics.h @@ -19,7 +19,7 @@ /* GCC-compatible compiler, targeting ARM with WMMX */ #include #elif (defined(__GNUC__) || defined(__xlC__)) && \ - (defined(__VEC__) || defined(__ALTIVEC__)) + (defined(__VEC__) || defined(__ALTIVEC__)) /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ #include #elif defined(__GNUC__) && defined(__SPE__) diff --git a/aten/src/ATen/cpu/vec256/vec256.h b/aten/src/ATen/cpu/vec256/vec256.h index 98f1158465f2f7..71688bd48e5090 100644 --- a/aten/src/ATen/cpu/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec256/vec256.h @@ -32,4 +32,194 @@ std::ostream& operator<<(std::ostream& stream, const Vec256& vec) { return stream; } + +#if defined(__AVX__) && !defined(_MSC_VER) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template<> +Vec256 cast(const Vec256& src) { + return _mm256_castpd_ps(src); +} + +template<> +Vec256 cast(const Vec256& src) { + return _mm256_castps_pd(src); +} + +#if defined(__AVX2__) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#define DEFINE_FLOAT_INT_CAST(int_t, float_t, float_ch) \ +template<> \ +Vec256 cast(const Vec256& src) { \ + return _mm256_castp ## float_ch ## _si256(src); \ +} \ +template<> \ +Vec256 cast(const Vec256& src) { \ + return _mm256_castsi256_p ## float_ch (src); \ +} + +DEFINE_FLOAT_INT_CAST(int64_t, double, d) +DEFINE_FLOAT_INT_CAST(int32_t, double, d) +DEFINE_FLOAT_INT_CAST(int16_t, double, d) +DEFINE_FLOAT_INT_CAST(int64_t, float, s) +DEFINE_FLOAT_INT_CAST(int32_t, float, s) +DEFINE_FLOAT_INT_CAST(int16_t, float, s) + +#undef DEFINE_FLOAT_INT_CAST + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +c10::guts::enable_if_t> +inline gather(const double* base_addr, const Vec256& vindex) { + return _mm256_i64gather_pd(base_addr, vindex, scale); +} + +template +c10::guts::enable_if_t> +inline gather(const float* base_addr, const Vec256& vindex) { + return _mm256_i32gather_ps(base_addr, vindex, scale); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, const double* base_addr, + const Vec256& vindex, const Vec256& mask) { + return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale); +} + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, const float* base_addr, + const Vec256& vindex, const Vec256& mask) { + return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Only works for inputs in the range: [-2^51, 2^51] +// From: https://stackoverflow.com/a/41148578 +template<> +Vec256 +inline convert_to_int_of_same_size(const Vec256 &src) { + auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000)); + return _mm256_sub_epi64( + _mm256_castpd_si256(x), + 
_mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) + ); +} + +template<> +Vec256 +inline convert_to_int_of_same_size(const Vec256 &src) { + return _mm256_cvttps_epi32(src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vec256> +inline interleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, a1, a3, a3} + // b = {b0, b1, b2, b3} + + // swap lanes: + // a_swapped = {a0, a1, b0, b1} + // b_swapped = {a2, a3, b2, b3} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + auto a_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_a); + auto b_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_b); + + // group cols crossing lanes: + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3 + return std::make_pair(_mm256_permute4x64_pd(a_swapped, group_ctrl), + _mm256_permute4x64_pd(b_swapped, group_ctrl)); +} + +template <> +std::pair, Vec256> +inline interleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + + // swap lanes: + // a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + auto a_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_a); + auto b_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_b); + + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, b6, a7, b7} + const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl), + _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vec256> +inline deinterleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + + // group cols crossing lanes: + // a_grouped = {a0, a1, b0, b1} + // b_grouped = {a2, a3, b2, b3} + static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3 + auto a_grouped = _mm256_permute4x64_pd(a, group_ctrl); + auto b_grouped = _mm256_permute4x64_pd(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_a), + _mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_b)); +} + +template <> +std::pair, Vec256> +inline deinterleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + + // group cols crossing lanes: + // a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? 
+ const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl); + auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_a), + _mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_b)); +} + +#endif // defined(__AVX2__) + +#endif // defined(__AVX__) && !defined(_MSC_VER) + }}} diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index fa3c4e550d6a32..f1eba7e2d3c428 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -3,8 +3,11 @@ #include #include #include +#include +#include #include "ATen/Utils.h" +#include "ATen/core/C++17.h" #if defined(__GNUC__) #define __at_align32__ __attribute__((aligned(32))) @@ -18,6 +21,21 @@ namespace at { namespace vec256 { namespace { +template struct int_of_size; + +#define DEFINE_INT_OF_SIZE(int_t) \ +template<> struct int_of_size { using type = int_t; } + +DEFINE_INT_OF_SIZE(int64_t); +DEFINE_INT_OF_SIZE(int32_t); +DEFINE_INT_OF_SIZE(int16_t); +DEFINE_INT_OF_SIZE(int8_t); + +#undef DEFINE_INT_OF_SIZE + +template +using int_same_size_t = typename int_of_size::type; + // NOTE: If you specialize on a type, you must define all operations! // emulates vectorized types @@ -33,8 +51,13 @@ struct Vec256 { values[i] = val; } } + template> + Vec256(Args... vals) { + values = { vals... }; + } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { int64_t mask = mask_; Vec256 vec; for (int64_t i = 0; i < size; i++) { @@ -47,7 +70,29 @@ struct Vec256 { } return vec; } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + Vec256 vec; + int_same_size_t buffer[size]; + mask.store(buffer); + for (int64_t i = 0; i < size; i++) { + if (buffer[i] & 0x01) + { + vec[i] = b[i]; + } else { + vec[i] = a[i]; + } + } + return vec; + } + static Vec256 arange(T base = static_cast(0), T step = static_cast(1)) { + Vec256 vec; + for (int64_t i = 0; i < size; i++) { + vec.values[i] = base + i * step; + } + return vec; + } + static Vec256 set(const Vec256& a, const Vec256& b, int64_t count = size) { Vec256 vec; for (int64_t i = 0; i < size; i++) { if (i < count) { @@ -173,9 +218,28 @@ struct Vec256 { } return ret; } +#define DEFINE_COMP(binary_pred) \ + Vec256 operator binary_pred(const Vec256 &other) const { \ + Vec256 vec; \ + for (int64_t i = 0; i != size; i++) { \ + if (values[i] binary_pred other.values[i]) { \ + std::memset(static_cast(vec.values + i), 0xFF, sizeof(T)); \ + } else { \ + std::memset(static_cast(vec.values + i), 0, sizeof(T)); \ + } \ + } \ + return vec; \ + } + DEFINE_COMP(==) + DEFINE_COMP(!=) + DEFINE_COMP(>=) + DEFINE_COMP(<=) + DEFINE_COMP(>) + DEFINE_COMP(<) +#undef DEFINE_COMP }; -template Vec256 operator+(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator+(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] + b[i]; @@ -183,7 +247,7 @@ template Vec256 operator+(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator-(const Vec256 &a, const 
Vec256 &b) { +template Vec256 inline operator-(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] - b[i]; @@ -191,7 +255,7 @@ template Vec256 operator-(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator*(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator*(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] * b[i]; @@ -199,7 +263,7 @@ template Vec256 operator*(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { +template Vec256 inline operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] / b[i]; @@ -207,7 +271,8 @@ template Vec256 operator/(const Vec256 &a, const Vec256 &b) _ return c; } -template Vec256 max(const Vec256 &a, const Vec256 &b) { + +template Vec256 inline max(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = std::max(a[i], b[i]); @@ -215,7 +280,7 @@ template Vec256 max(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 min(const Vec256 &a, const Vec256 &b) { +template Vec256 inline min(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = std::min(a[i], b[i]); @@ -223,9 +288,154 @@ template Vec256 min(const Vec256 &a, const Vec256 &b) { return c; } +#define DEFINE_BITWISE_OP(op) \ +template \ +Vec256 inline operator op(const Vec256 &a, const Vec256 &b) { \ + using iT = int_same_size_t; \ + iT buffer[Vec256::size]; \ + for (int64_t i = 0; i != Vec256::size; i++) { \ + auto a_val = a[i]; \ + auto b_val = b[i]; \ + iT *i_a_ptr = reinterpret_cast(&a_val); \ + iT *i_b_ptr = reinterpret_cast(&b_val); \ + buffer[i] = *i_a_ptr op *i_b_ptr; \ + } \ + return Vec256::loadu(buffer); \ +} +DEFINE_BITWISE_OP(&) +DEFINE_BITWISE_OP(|) +DEFINE_BITWISE_OP(^) +#undef DEFINE_BITWISE_OP + template -T fmadd(const T& a, const T& b, const T& c) { +inline T fmadd(const T& a, const T& b, const T& c) { return a * b + c; } +template +c10::guts::enable_if_t> +inline gather(T const* base_addr, const Vec256>& vindex) { + static constexpr int size = Vec256::size; + int_same_size_t index_arr[size]; + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (int64_t i = 0; i < size; i++) { + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } + return Vec256::loadu(static_cast(buffer)); +} + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, T const* base_addr, + const Vec256>& vindex, Vec256& mask) { + static constexpr int size = Vec256::size; + T src_arr[size]; + int_same_size_t mask_arr[size]; // use int type so we can logical and + int_same_size_t index_arr[size]; + src.store(static_cast(src_arr)); + mask.store(static_cast(mask_arr)); + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (int64_t i = 0; i < size; i++) { + if (mask_arr[i] & 0x01) { // check highest bit + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } else { + buffer[i] = src_arr[i]; + } + } + mask = Vec256(); // "zero out" mask + return Vec256::loadu(static_cast(buffer)); +} + +// Cast a given vector to another type without changing the bits representation. +// So a Vec of 256 bits containing all ones can be cast to a +// Vec of 256 bits containing all ones (i.e., four negative 1s). 
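(Aside, not part of the diff: to make the comment above concrete, a small hypothetical example contrasting the bit-preserving cast defined just below with the value-converting convert_to_int_of_same_size; the lane values are made up.)

  // cast<> reinterprets the 256-bit payload; no values are converted.
  Vec256<int64_t> all_ones(int64_t(-1));                  // every lane is 0xFFFF...FF
  Vec256<double> reinterpreted = cast<double>(all_ones);  // same bits, now read as four NaNs
  // convert_to_int_of_same_size() converts lane values instead:
  Vec256<double> v(3.0);
  Vec256<int64_t> as_int = convert_to_int_of_same_size(v);  // {3, 3, 3, 3}
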
+namespace { + // There is a struct here because we don't have static_if and I can't + // partially specialize a templated function. + template + struct CastImpl { + static inline Vec256 apply(const Vec256& src) { + src_t src_arr[Vec256::size]; + src.store(static_cast(src_arr)); + return Vec256::loadu(static_cast(src_arr)); + } + }; + + template + struct CastImpl { + static inline Vec256 apply(const Vec256& src) { + return src; + } + }; +} +template +Vec256 cast(const Vec256& src) { + return CastImpl::apply(src); +} + +template +inline Vec256> convert_to_int_of_same_size(const Vec256& src) { + static constexpr int size = Vec256::size; + T src_arr[size]; + src.store(static_cast(src_arr)); + int_same_size_t buffer[size]; + for (int64_t i = 0; i < size; i++) { + buffer[i] = static_cast>(src_arr[i]); + } + return Vec256>::loadu(static_cast(buffer)); +} + +// E.g., inputs: a Vec256 = {a0, b0, a1, b1, a2, b2, a3, b3} +// b Vec256 = {a4, b4, a5, b5, a6, b6, a7, b7} +// returns: Vec256 = {a0, a1, a2, a3, a4, a5, a6, a7} +// Vec256 = {b0, b1, b2, b3, b4, b5, b6, b7} +template +inline c10::guts::enable_if_t::size % 2 == 0, std::pair, Vec256>> +deinterleave2(const Vec256& a, const Vec256& b) { + static constexpr int size = Vec256::size; + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (int64_t i = 0; i < half_size; i++) { + buffer1[i] = a_arr[i * 2]; + buffer1[half_size + i] = b_arr[i * 2]; + buffer2[i] = a_arr[i * 2 + 1]; + buffer2[half_size + i] = b_arr[i * 2 + 1]; + } + return std::make_pair(Vec256::loadu(static_cast(buffer1)), + Vec256::loadu(static_cast(buffer2))); +} + +// inverse operation of deinterleave2 +// E.g., inputs: a Vec256 = {a0, a1, a2, a3, a4, a5, a6, a7} +// b Vec256 = {b0, b1, b2, b3, b4, b5, b6, b7} +// returns: Vec256 = {a0, b0, a1, b1, a2, b2, a3, b3} +// Vec256 = {a4, b4, a5, b5, a6, b6, a7, b7} +template +inline c10::guts::enable_if_t::size % 2 == 0, std::pair, Vec256>> +interleave2(const Vec256& a, const Vec256& b) { + static constexpr int size = Vec256::size; + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (int64_t i = 0; i < half_size; i++) { + buffer1[i * 2] = a_arr[i]; + buffer1[i * 2 + 1] = b_arr[i]; + buffer2[i * 2] = a_arr[half_size + i]; + buffer2[i * 2 + 1] = b_arr[half_size + i]; + } + return std::make_pair(Vec256::loadu(static_cast(buffer1)), + Vec256::loadu(static_cast(buffer2))); +} + }}} diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h index 05c21634659754..ced6fa6a37b6aa 100644 --- a/aten/src/ATen/cpu/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -22,14 +22,25 @@ template <> class Vec256 { Vec256(double val) { values = _mm256_set1_pd(val); } + Vec256(double val1, double val2, double val3, double val4) { + values = _mm256_setr_pd(val1, val2, val3, val4); + } operator __m256d() const { return values; } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { return _mm256_blend_pd(a.values, b.values, mask); } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_pd(a.values, b.values, mask.values); + } + static Vec256 arange(double base = 0., double step 
= 1.) { + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } + static Vec256 set(const Vec256& a, const Vec256& b, + int64_t count = size) { switch (count) { case 0: return a; @@ -56,7 +67,7 @@ template <> class Vec256 { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_pd(reinterpret_cast(ptr), values); - } else { + } else if (count > 0) { double tmp_values[size]; _mm256_storeu_pd(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(double)); @@ -154,6 +165,32 @@ template <> class Vec256 { Vec256 pow(const Vec256 &b) const { return Vec256(Sleef_powd4_u10(values, b)); } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vec256 operator==(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); + } + + Vec256 operator!=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_NEQ_OQ); + } + + Vec256 operator<(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LT_OQ); + } + + Vec256 operator<=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LE_OQ); + } + + Vec256 operator>(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GT_OQ); + } + + Vec256 operator>=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GE_OQ); + } }; template <> @@ -186,9 +223,24 @@ Vec256 inline min(const Vec256& a, const Vec256& b) { return _mm256_min_pd(a, b); } +template <> +Vec256 inline operator&(const Vec256& a, const Vec256& b) { + return _mm256_and_pd(a, b); +} + +template <> +Vec256 inline operator|(const Vec256& a, const Vec256& b) { + return _mm256_or_pd(a, b); +} + +template <> +Vec256 inline operator^(const Vec256& a, const Vec256& b) { + return _mm256_xor_pd(a, b); +} + #ifdef __AVX2__ template <> -Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { +Vec256 inline fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { return _mm256_fmadd_pd(a, b, c); } #endif diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h index c38fae11c24863..ebd0c10d2b62d1 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -16,20 +16,34 @@ template <> class Vec256 { private: __m256 values; public: - static constexpr int64_t size = 8; + static constexpr int size = 8; Vec256() {} Vec256(__m256 v) : values(v) {} Vec256(float val) { values = _mm256_set1_ps(val); } + Vec256(float val1, float val2, float val3, float val4, + float val5, float val6, float val7, float val8) { + values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); + } operator __m256() const { return values; } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { return _mm256_blend_ps(a.values, b.values, mask); } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_ps(a.values, b.values, mask.values); + } + static Vec256 arange(float base = 0.f, float step = 1.f) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); + } + static Vec256 set(const Vec256& a, const Vec256& b, + int64_t count = size) { switch (count) { case 0: return a; @@ -61,7 +75,7 @@ template 
<> class Vec256 { void store(void* ptr, int64_t count = size) const { if (count == size) { _mm256_storeu_ps(reinterpret_cast(ptr), values); - } else { + } else if (count > 0) { float tmp_values[size]; _mm256_storeu_ps(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(float)); @@ -159,6 +173,32 @@ template <> class Vec256 { Vec256 pow(const Vec256 &b) const { return Vec256(Sleef_powf8_u10(values, b)); } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vec256 operator==(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); + } + + Vec256 operator!=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_NEQ_OQ); + } + + Vec256 operator<(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ); + } + + Vec256 operator<=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ); + } + + Vec256 operator>(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ); + } + + Vec256 operator>=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ); + } }; template <> @@ -191,9 +231,24 @@ Vec256 inline min(const Vec256& a, const Vec256& b) { return _mm256_min_ps(a, b); } +template <> +Vec256 inline operator&(const Vec256& a, const Vec256& b) { + return _mm256_and_ps(a, b); +} + +template <> +Vec256 inline operator|(const Vec256& a, const Vec256& b) { + return _mm256_or_ps(a, b); +} + +template <> +Vec256 inline operator^(const Vec256& a, const Vec256& b) { + return _mm256_xor_ps(a, b); +} + #ifdef __AVX2__ template <> -Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { +Vec256 inline fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { return _mm256_fmadd_ps(a, b, c); } #endif diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h index c9b643e7d4bb09..2ca4d614c21e7b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -26,6 +26,9 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int64_t v) { values = _mm256_set1_epi64x(v); } + Vec256(int64_t val1, int64_t val2, int64_t val3, int64_t val4) { + values = _mm256_setr_epi64x(val1, val2, val3, val4); + } template static Vec256 blend(Vec256 a, Vec256 b) { __at_align32__ int64_t tmp_values[size]; @@ -40,6 +43,13 @@ struct Vec256 : public Vec256i { tmp_values[3] = _mm256_extract_epi64(b.values, 3); return loadu(tmp_values); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int64_t base = 0, int64_t step = 1) { + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { switch (count) { @@ -65,7 +75,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int64_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); @@ -79,6 +89,30 @@ struct Vec256 : public Vec256i { auto inverse = _mm256_xor_si256(values, is_larger); return _mm256_sub_epi64(inverse, is_larger); } + Vec256 operator==(const Vec256& other) 
const { + return _mm256_cmpeq_epi64(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi64(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi64(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi64(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi64(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi64(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -87,10 +121,23 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int32_t v) { values = _mm256_set1_epi32(v); } + Vec256(int32_t val1, int32_t val2, int32_t val3, int32_t val4, + int32_t val5, int32_t val6, int32_t val7, int32_t val8) { + values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8); + } template static Vec256 blend(Vec256 a, Vec256 b) { return _mm256_blend_epi32(a, b, mask); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int32_t base = 0, int32_t step = 1) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); + } static Vec256 set(Vec256 a, Vec256 b, int32_t count = size) { switch (count) { @@ -124,7 +171,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int32_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); @@ -135,6 +182,30 @@ struct Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi32(values); } + Vec256 operator==(const Vec256& other) const { + return _mm256_cmpeq_epi32(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi32(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi32(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi32(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi32(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi32(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -143,6 +214,13 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int16_t v) { values = _mm256_set1_epi16(v); } + Vec256(int16_t val1, int16_t val2, int16_t val3, int16_t val4, + int16_t val5, int16_t val6, int16_t val7, int16_t val8, + int16_t val9, int16_t val10, int16_t val11, int16_t val12, + int16_t val13, int16_t val14, int16_t val15, int16_t val16) { + values = _mm256_setr_epi16(val1, 
val2, val3, val4, val5, val6, val7, val8, + val9, val10, val11, val12, val13, val14, val15, val16); + } template static Vec256 blend(Vec256 a, Vec256 b) { __at_align32__ int16_t tmp_values[size]; @@ -181,6 +259,17 @@ struct Vec256 : public Vec256i { tmp_values[15] = _mm256_extract_epi16(b.values, 15); return loadu(tmp_values); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int16_t base = 0, int16_t step = 1) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step, + base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step, + base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step); + } static Vec256 set(Vec256 a, Vec256 b, int16_t count = size) { switch (count) { @@ -230,7 +319,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int16_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); @@ -241,6 +330,30 @@ struct Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi16(values); } + Vec256 operator==(const Vec256& other) const { + return _mm256_cmpeq_epi16(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi16(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi16(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi16(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi16(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi16(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -258,6 +371,21 @@ Vec256 inline operator+(const Vec256& a, const Vec256 return _mm256_add_epi16(a, b); } +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi64(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi32(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi16(a, b); +} + // AVX2 has no intrinsic for int64_t multiply so it needs to be emulated // This could be implemented more efficiently using epi32 instructions // This is also technically avx compatible, but then we'll need AVX @@ -293,7 +421,7 @@ Vec256 inline operator*(const Vec256& a, const Vec256 } template -Vec256 intdiv_256(const Vec256& a, const Vec256& b) { +Vec256 inline intdiv_256(const Vec256& a, const Vec256& b) { T values_a[Vec256::size]; T values_b[Vec256::size]; a.store(values_a); @@ -304,20 +432,26 @@ Vec256 intdiv_256(const Vec256& a, const Vec256& b) { return Vec256::loadu(values_a); } -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); +#define DEFINE_INTEGER_BINARY_OP(op, func) \ +template <> \ +Vec256 inline operator op(const Vec256& a, 
const Vec256& b) { \ + return func(a, b); \ +} \ +template <> \ +Vec256 inline operator op(const Vec256& a, const Vec256& b) { \ + return func(a, b); \ +} \ +template <> \ +Vec256 inline operator op(const Vec256& a, const Vec256& b) { \ + return func(a, b); \ } -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); -} +DEFINE_INTEGER_BINARY_OP(/, intdiv_256) +DEFINE_INTEGER_BINARY_OP(&, _mm256_and_si256) +DEFINE_INTEGER_BINARY_OP(|, _mm256_or_si256) +DEFINE_INTEGER_BINARY_OP(^, _mm256_xor_si256) -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); -} +#undef DEFINE_INTEGER_BINARY_OP #endif diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h index 7b41f1fe3f7235..761551f808610b 100644 --- a/aten/src/ATen/cuda/ATenCUDAGeneral.h +++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + #ifdef _WIN32 # if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) || defined(CAFFE2_CUDA_BUILD_MAIN_LIB) # define AT_CUDA_API __declspec(dllexport) diff --git a/aten/src/ATen/cuda/CUDAHalf.cu b/aten/src/ATen/cuda/CUDAHalf.cu deleted file mode 100644 index bd121250ee4847..00000000000000 --- a/aten/src/ATen/cuda/CUDAHalf.cu +++ /dev/null @@ -1,51 +0,0 @@ -#include "ATen/core/Half.h" -#include "ATen/cuda/CUDAHalf.cuh" - -#include -#include -#include - -namespace at { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - -half Converter::operator()(Half aten_half) { - return half{aten_half.x}; -} - -half Converter::operator()(double value) { - return half{Half(value).x}; -} - -Half Converter::operator()(half cuda_half) { - return Half(cuda_half.x, Half::from_bits); -} -#else -half Converter::operator()(Half aten_half) { - __half_raw x_raw; - x_raw.x = aten_half.x; - return half(x_raw); -} - -Half Converter::operator()(half cuda_half) { - __half_raw raw(cuda_half); - return Half(raw.x, Half::from_bits); -} - -half Converter::operator()(double value) { - __half_raw raw; - raw.x = Half(value).x; - return half {raw}; -} - -template <> __half HalfFix(Half h) { - __half_raw raw; - raw.x = h.x; - return __half{raw}; -} - -template <> Half HalfFix(__half h) { - __half_raw raw(h); - return Half(raw.x, Half::from_bits); -} -#endif -} // namespace at diff --git a/aten/src/ATen/cuda/CUDAHalf.cuh b/aten/src/ATen/cuda/CUDAHalf.cuh deleted file mode 100644 index 6558ed518ac1fd..00000000000000 --- a/aten/src/ATen/cuda/CUDAHalf.cuh +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "ATen/cuda/ATenCUDAGeneral.h" -#include "ATen/core/Half.h" - -#include -#include -#include - -namespace at { - -template <> -struct AT_CUDA_API Converter { - half operator()(Half); -}; - -template <> -struct AT_CUDA_API Converter { - Half operator()(half); -}; - -template <> -struct AT_CUDA_API Converter { - half operator()(double); -}; - -#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -template <> __half HalfFix(Half h); -template <> Half HalfFix(__half h); -#endif -} // namespace at diff --git a/aten/src/ATen/cuda/NumericLimits.cuh b/aten/src/ATen/cuda/NumericLimits.cuh index 325cbce737dd51..981bf8c1c34efa 100644 --- a/aten/src/ATen/cuda/NumericLimits.cuh +++ b/aten/src/ATen/cuda/NumericLimits.cuh @@ -3,6 +3,7 @@ #include #include #include +#include // NumericLimits.cuh is a holder for numeric limits definitions of commonly used // types. This header is very specific to ROCm HIP and may be removed in the future. 
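(Aside, not part of the diff: the bounds in NumericLimits.cuh are __host__ __device__, so host or kernel code can use them directly. A tiny sketch, assuming this hunk belongs to the at::numeric_limits specialization for double whose upper_bound() appears just below.)

  // Seed a min-reduction with +inf using the header's bound helper.
  double init = at::numeric_limits<double>::upper_bound();  // returns inf, per the definition below
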
@@ -101,4 +102,4 @@ struct numeric_limits { static inline __host__ __device__ double upper_bound() { return inf; } }; -} // namespace at \ No newline at end of file +} // namespace at diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index e87a7bb88f8eb4..5df218a89cc06d 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -39,7 +39,7 @@ def TypedDict(name, attrs, total=True): # type: ignore # declaration under Type.h (right now, we call this template # BROADCAST but it also handles default arguments) TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ -${return_type} ${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${api_name}(${type_method_formals}) const override; """) # 2. broadcasting functions are implemented in Type.cpp TYPE_METHOD_DEFINITION_BROADCAST = CodeTemplate("""\ @@ -60,18 +60,18 @@ def TypedDict(name, attrs, total=True): # type: ignore # for 'native' declarations (so the native dispatch is hardcoded into # the template here.) PURE_VIRTUAL_TYPE_METHOD_DECLARATION = CodeTemplate("""\ -virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const = 0; +virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const = 0; """) DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION = CodeTemplate("""\ AT_DEPRECATED(virtual ${return_type} \ -${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const = 0); +${method_prefix_derived}${api_name}(${type_method_formals}) const = 0); """) PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ -virtual ${return_type} ${api_name}(${type_method_formals_with_defaults}) const = 0; +virtual ${return_type} ${api_name}(${type_method_formals}) const = 0; """) TYPE_METHOD_DECLARATION_ABSTRACT = CodeTemplate("""\ -${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const override; """) TYPE_METHOD_DEFINITION_ABSTRACT = CodeTemplate("""\ ${return_type} TypeDefault::${method_prefix_derived}${api_name}(${type_method_formals}) const { @@ -79,7 +79,7 @@ def TypedDict(name, attrs, total=True): # type: ignore } """) TYPE_METHOD_DECLARATION_CONCRETE = CodeTemplate("""\ -${return_type} ${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${api_name}(${type_method_formals}) const override; """) TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ ${return_type} TypeDefault::${api_name}(${type_method_formals}) const { @@ -107,6 +107,10 @@ def TypedDict(name, attrs, total=True): # type: ignore # NB: As far as ezyang can tell, we don't *have* to codegen this, # because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in # the superclass. But it doesn't seem to be harmful. +# +# TODO: self_ty is a hack to make things work for native methods which need to +# take a dtype, but also need to dispatch differently for different types. +# Eliminate it at some point. 
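(Aside, not part of the diff: switching the Type declaration templates from ${type_method_formals_with_defaults} to ${type_method_formals} means the generated virtuals no longer repeat default argument values; defaults presumably stay on the Tensor-method and namespace-function signatures, whose *_with_defaults formals are untouched. Roughly, for a made-up operator:)

  // previously emitted into Type.h:
  //   virtual Tensor foo(const Tensor & self, Scalar alpha = 1) const = 0;
  // emitted after this change:
  //   virtual Tensor foo(const Tensor & self, Scalar alpha) const = 0;
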
TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} @@ -173,7 +177,7 @@ def TypedDict(name, attrs, total=True): # type: ignore # the same name (but different signature) already ZERO_DIM_CHECK = CodeTemplate("""\ if (${check_name}.dim() == 0) { - return static_cast(this)->${api_name}(${zero_dim_actuals}); + return static_cast(this)->${api_name}(${zero_dim_actuals}); }""") ZERO_DIM_ONLY = CodeTemplate("""\ @@ -183,12 +187,12 @@ def TypedDict(name, attrs, total=True): # type: ignore SPARSE_CHECK = CodeTemplate("""\ if(${check_name}.type().is_sparse()) { - return static_cast(this)->${api_name}(${sparse_actuals}); + return static_cast(this)->${api_name}(${sparse_actuals}); }""") BUFFER_DEFINITION = CodeTemplate("""\ -auto ${name}_ = c10::make_intrusive( - ${Backend}TensorId(), ScalarType::${ScalarName}, ${THTensor}_new(), false).release(); +auto ${name}_ = c10::make_intrusive( + ${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), ${THTensor}_new(), false).release(); auto ${name} = Tensor(${name}_, false);""") CONDITIONAL_INITIALIZER = CodeTemplate("""\ @@ -198,8 +202,6 @@ def TypedDict(name, attrs, total=True): # type: ignore CALL_TEMPLATE = CodeTemplate("${cname}(${actuals})") -HALF_CONVERSION = CodeTemplate("convert(${value})") - class NYIError(Exception): """Indicates we don't support this declaration yet""" @@ -330,18 +332,19 @@ def __init__(self, reason): CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? ${usage} : NULL') ALLOC_NOARGS_WRAP = { - 'THTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', - 'THBoolTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Byte, allocator(), false).release()', - 'THIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()', - 'THIntegerTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Int, allocator(), false).release()', - 'THDenseTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', - 'THDenseIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()' + 'THTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', + 'THBoolTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Byte), allocator(), false).release()', + 'THIndexTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), allocator(), false).release()', + 'THIntegerTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Int), allocator(), false).release()', + 'THDenseTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', + 'THDenseIndexTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), ' + 'allocator(), false).release()' } ALLOC_WRAP = { @@ -390,6 +393,7 @@ def __getitem__(self, x): 'type_registrations': List[str], 'type_headers': List[str], 'pure_virtual_type_method_declarations': List[str], + 'pure_virtual_extended_type_method_declarations': List[str], 'type_method_declarations': List[str], 'type_method_definitions': List[str], 'type_method_inline_definitions': List[str], @@ -490,6 +494,9 @@ def __getitem__(self, x): 
'formals': List[str], 'inferred_type': str, 'inplace': bool, + # This controls whether or not we generate the interface in Type or + # TypeExtendedInterface + 'extended_method': bool, 'method_actuals': List[str], 'method_formals_with_defaults': List[str], 'method_formals': List[str], @@ -509,7 +516,6 @@ def __getitem__(self, x): 'type_definition_body': List[str], 'type_method_actuals': List[str], 'type_method_definition_dispatch': str, - 'type_method_formals_with_defaults': List[str], 'type_method_formals': List[str], 'variants': str, 'when_spares_dispatch': str, @@ -811,7 +817,6 @@ def process_option(option, output_options): # There are no cases where these differ, but they do in native_functions option['type_method_formals'] = option['formals'] - option['type_method_formals_with_defaults'] = option['formals_with_defaults'] option['type_method_actuals'] = option['actuals'] option['const_mark'] = '' if option['inplace'] else ' const' @@ -836,8 +841,12 @@ def process_option(option, output_options): # NN function with no _forward/_backward suffix don't have cimpls. # They call the _forward function and discard any buffer returns abstract = False - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) body = emit_nn_body(option) @@ -845,17 +854,27 @@ def process_option(option, output_options): TYPE_METHOD_DEFINITION_CONCRETE.substitute( env, type_definition_body=body)) elif broadcast_arg is None: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_ABSTRACT.substitute(env)) top_env['type_method_definitions'].append( TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) else: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) top_env['type_method_declarations'].append( @@ -888,7 +907,7 @@ def process_option(option, output_options): method_of.append('Tensor') if is_namespace_function: - option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + 
option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) top_env['function_declarations'].append( FUNCTION_DECLARATION.substitute(env)) top_env['function_definitions'].append( @@ -1031,7 +1050,6 @@ def find_formal(formal_name, formals): dispatch_type['is_type_dispatched'] = True option['type_method_formals'] = [format_formal(f) for f in formals if f != dispatch_type] - option['type_method_formals_with_defaults'] = [formal_with_default(f) for f in formals if f != dispatch_type] option['type_method_actuals'] = [f['name'] for f in formals if f != dispatch_type] option['native_actuals'] = [f['name'] if f != dispatch_type else '*this' for f in formals] @@ -1060,11 +1078,21 @@ def find_formal(formal_name, formals): # Factory methods are not dispatched over `Type`. if not is_factory_method: if option['deprecated']: + # Deprecated functions are always non-extended, + # because they need to be made available from Type + # (the public interface) so that code like + # tensor.type().arange(...) keeps working. Once + # we remove the deprecated functions, we can eliminate + # these methods entirely. top_env['pure_virtual_type_method_declarations'].append( DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) else: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append(TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) dispatch = option['type_method_definition_dispatch'] option['native_type_method_dispatch'] = dispatch @@ -1116,9 +1144,9 @@ def find_formal(formal_name, formals): if is_namespace_function: if dispatch_type: - option['inferred_type'] = dispatch_type['name'] + option['inferred_type'] = 'static_cast({})'.format(dispatch_type['name']) elif dispatch_tensor: - option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) else: # doesn't depend on a specific type, use undefined float option['inferred_type'] = 'at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)' @@ -1170,8 +1198,6 @@ def create_derived(backend_type_env, declarations): is_cuda = 'CUDA' in backend_type_env['Backend'] - real_is_half = backend_type_env['ScalarName'] == 'Half' - def replace_with_null(argument): # type: (THFormal) -> bool return (argument['type'] == 'THGenerator*' and @@ -1198,8 +1224,6 @@ def get_argument(argument, option): elif requires_checked_cast(argument): checked_use = CHECKED_USE.get( argument['type'], '{}_').format(argument['name']) - if real_is_half and argument['type'] == 'real': - checked_use = HALF_CONVERSION.substitute(value=checked_use) if nullable_argument(argument): checked_use = CHECKED_USE_NULLABLE.substitute( env={}, arg_name=argument['name'], usage=checked_use) @@ -1295,11 +1319,12 @@ def allocate_arg(env, arg, output_count): tensor_arg = '{}_'.format(name) if arg.get('mask', False): allocation = 'output_mask[{}] ? {} : nullptr'.format(output_count, allocation) - tensor_arg = ('{}_ == nullptr ? (TensorImpl*)UndefinedTensor::singleton() : (TensorImpl*){}_' + tensor_arg = ('{}_ == nullptr ? 
(TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*){}_' .format(name, name)) + intrusive_ptr_type = 'c10::intrusive_ptr' return [ 'auto {}_ = {};'.format(name, allocation), - 'auto {} = Tensor({}, false);'.format(name, tensor_arg), + 'auto {} = Tensor({}::reclaim({}));'.format(name, intrusive_ptr_type, tensor_arg), ] def resize_arg(arg): @@ -1507,7 +1532,10 @@ def emit_body(env, option): else "" wrapped_tensor = CodeTemplate(ALLOC_WRAP[ret['type']]).substitute( env, arguments=[call]) - return_tensor = "return Tensor((${wrapped_tensor})${maybe_scalar},false);" + return_tensor = ( + "return Tensor(" + + "c10::intrusive_ptr::reclaim(" + + "(${wrapped_tensor})${maybe_scalar}));") body.append(CodeTemplate(return_tensor).substitute( env, wrapped_tensor=wrapped_tensor, maybe_scalar=maybe_scalar)) # return the same underlying Tensor type for both real and accreal; this ensures diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 3f962961f55812..3112e5ff0424ab 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -1,5 +1,6 @@ import argparse import os +import filecmp import yaml from collections import OrderedDict @@ -40,19 +41,27 @@ parser.add_argument( '-d', '--install_dir', help='output directory', default='ATen') options = parser.parse_args() +gen_to_source = os.environ.get('GEN_TO_SOURCE') # update source directly as part of gen +if not gen_to_source: + core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None +else: + core_install_dir = os.path.join(options.source_path, 'core') if options.install_dir is not None and not os.path.exists(options.install_dir): os.makedirs(options.install_dir) +if core_install_dir is not None and not os.path.exists(core_install_dir): + os.makedirs(core_install_dir) class FileManager(object): - def __init__(self): + def __init__(self, install_dir=None): + self.install_dir = install_dir if install_dir else options.install_dir self.filenames = set() self.outputs_written = False self.undeclared_files = [] def will_write(self, filename): - filename = '{}/{}'.format(options.install_dir, filename) + filename = '{}/{}'.format(self.install_dir, filename) if self.outputs_written: raise Exception("'will_write' can only be called before " + "the call to write_outputs, refactor so outputs are registered " + @@ -78,7 +87,7 @@ def write_outputs(self, filename): self.outputs_written = True def write(self, filename, s, env=None): - filename = '{}/{}'.format(options.install_dir, filename) + filename = '{}/{}'.format(self.install_dir, filename) if isinstance(s, CodeTemplate): assert env is not None env['generated_comment'] = "@" + "generated by aten/src/ATen/gen.py" @@ -107,6 +116,7 @@ def check_all_files_written(self): SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") +TYPE_EXTENDED_INTERFACE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeExtendedInterface.h") TYPE_DEFAULT_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.h") TYPE_DEFAULT_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.cpp") @@ -127,6 +137,7 @@ def check_all_files_written(self): context->registerType(Backend::${backend}, ScalarType::${scalar_type}, new ${type_name}()); """) +core_file_manager = FileManager(core_install_dir) file_manager = FileManager() cuda_file_manager = FileManager() @@ -155,7 +166,7 @@ def check_all_files_written(self): 
('Int', 'int', 'Long', 'int32_t', False), ('Long', 'int64_t', 'Long', 'int64_t', False), ('Short', 'int16_t', 'Long', 'int16_t', False), - ('Half', 'Half', 'Double', 'THHalf', True), + ('Half', 'Half', 'Double', 'at::Half', True), ] # shared environment for non-derived base classes Type.h Tensor.h Storage.h @@ -165,6 +176,7 @@ def check_all_files_written(self): 'cuda_type_registrations': [], 'cuda_type_headers': [], 'pure_virtual_type_method_declarations': [], + 'pure_virtual_extended_type_method_declarations': [], 'type_method_declarations': [], 'type_method_definitions': [], 'type_method_inline_definitions': [], @@ -236,7 +248,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['DenseBackend'] = backend env['storage_tensor_headers'] = [] if density != 'Sparse': - env['storage_tensor_headers'] = ['#include "ATen/TensorImpl.h"'] + env['storage_tensor_headers'] = ['#include "ATen/core/TensorImpl.h"'] # used for generating switch logic for external functions tag = density_tag + backend + scalar_name @@ -251,7 +263,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations '#undef THNN_', '#undef THCIndexTensor_', ] - env['extra_cuda_headers'] = ['#include '] + env['extra_cuda_headers'] = ['#include '] env['extra_cuda_headers'].append('#include ') env['extra_cuda_headers'].append('#include ') env['extra_cuda_headers'].append('#include ') @@ -284,7 +296,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if scalar_name == "Half": env['SparseTensor'] = 'Tensor' if backend == "CUDA": - env['AS_REAL'] = 'convert' + env['AS_REAL'] = 'convert' declarations, definitions = function_wrapper.create_derived( env, declarations) @@ -330,9 +342,11 @@ def iterate_types(): # so that the script runs quickly when we are just querying the # outputs def declare_outputs(): - files = ['Declarations.yaml', 'Type.h', 'TypeDefault.cpp', 'TypeDefault.h', 'Tensor.h', - 'TensorMethods.h', 'Functions.h', - 'CPUCopy.cpp', 'NativeFunctions.h', + core_files = ['Type.h', 'Tensor.h', 'TensorMethods.h'] + for f in core_files: + core_file_manager.will_write(f) + files = ['Declarations.yaml', 'TypeExtendedInterface.h', 'TypeDefault.cpp', 'TypeDefault.h', + 'Functions.h', 'CPUCopy.cpp', 'NativeFunctions.h', 'RegisterCPU.cpp', 'RegisterCPU.h'] for f in files: file_manager.will_write(f) @@ -399,7 +413,16 @@ def generate_outputs(): all_types.append(generate_storage_type_and_tensor( backend, density, scalar_type, declarations)) - file_manager.write('Type.h', TYPE_H, top_env) + core_files = { + 'Type.h': TYPE_H, + 'Tensor.h': TENSOR_H, + 'TensorMethods.h': TENSOR_METHODS_H + } + + for core_file, core_template_file in core_files.items(): + core_file_manager.write(core_file, core_template_file, top_env) + + file_manager.write('TypeExtendedInterface.h', TYPE_EXTENDED_INTERFACE_H, top_env) file_manager.write('TypeDefault.h', TYPE_DEFAULT_H, top_env) file_manager.write('TypeDefault.cpp', TYPE_DEFAULT_CPP, top_env) @@ -409,8 +432,6 @@ def generate_outputs(): cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) - file_manager.write('Tensor.h', TENSOR_H, top_env) - file_manager.write('TensorMethods.h', TENSOR_METHODS_H, top_env) file_manager.write('Functions.h', FUNCTIONS_H, top_env) file_manager.write('CPUCopy.cpp', copy_wrapper.create(all_types, 'CPU')) @@ -420,10 +441,25 @@ def generate_outputs(): file_manager.check_all_files_written() 
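# --- Editor's note (illustrative sketch, not part of this patch) ---------------
# The block added just below checks that the freshly generated "core" headers are
# byte-identical to the copies checked into the source tree, using the standard
# library helper filecmp.cmpfiles(..., shallow=False). A minimal standalone version
# of the same pattern is sketched here; the function name, directory arguments and
# file list are hypothetical placeholders, not names from this patch.
import filecmp

def check_generated_matches_source(generated_dir, source_dir, filenames):
    # shallow=False forces a content comparison rather than an os.stat() comparison
    _, mismatch, errors = filecmp.cmpfiles(
        generated_dir, source_dir, filenames, shallow=False)
    if errors:
        raise RuntimeError("could not compare {} between {} and {}"
                           .format(errors, generated_dir, source_dir))
    if mismatch:
        # Tell the developer how to refresh the checked-in copies.
        raise RuntimeError("{} are out of date; copy them from {} into {}"
                           .format(mismatch, generated_dir, source_dir))
# -------------------------------------------------------------------------------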
cuda_file_manager.check_all_files_written() + # check that generated files match source files + core_source_path = os.path.join(options.source_path, 'core') + match, mismatch, errors = filecmp.cmpfiles(core_install_dir, core_source_path, core_files.keys(), shallow=False) + if errors: + raise RuntimeError("Error while trying to compare source and generated files for {}. " + "Source directory: {}. Generated directory: {}." + .format(errors, core_source_path, core_install_dir)) + if mismatch: + file_component = '{}'.format(','.join(mismatch)) + if len(mismatch) > 1: + file_component = '{' + file_component + '}' + update_cmd = "cp {}/{} {}".format(core_install_dir, file_component, core_source_path) + raise RuntimeError("Source files: {} did not match generated files. To update the source files, " + "run \"{}\"".format(mismatch, update_cmd)) declare_outputs() if options.output_dependencies is not None: file_manager.write_outputs(options.output_dependencies) + core_file_manager.write_outputs(options.output_dependencies + "-core") cuda_file_manager.write_outputs(options.output_dependencies + "-cuda") else: generate_outputs() diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index f359d67c72e786..96ddb5ae3928b1 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -155,70 +155,51 @@ static void check_input_shape_forward(const at::Tensor& input, int64_t k = input.ndimension(); int64_t weight_dim = weight.ndimension(); - if (weight_dim != k) { - std::stringstream ss; - ss << "Expected " << weight_dim << "-dimensional input for " << weight_dim - << "-dimensional weight " << weight.sizes() << ", but got input of size " - << input.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } - if (weight.size(0) < groups) { - std::stringstream ss; - ss << "Given groups=" << groups << ", expected weight to be at least " - << groups << " at dimension 0, but got weight of size " << weight.sizes() - << " instead"; - throw std::runtime_error(ss.str()); - } - if (weight.size(0) % groups != 0) { - std::stringstream ss; - ss << "Given groups=" << groups << ", expected weight to be divisible by " - << groups << " at dimension 0, but got weight of size " << weight.sizes() - << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(weight_dim == k, + "Expected ", weight_dim, "-dimensional input for ", weight_dim, + "-dimensional weight ", weight.sizes(), ", but got ", k, "-dimensional input of size ", + input.sizes(), " instead"); + AT_CHECK(weight.size(0) >= groups, + "Given groups=", groups, ", expected weight to be at least ", groups, + " at dimension 0, but got weight of size ", weight.sizes(), " instead"); + AT_CHECK(weight.size(0) % groups == 0, + "Given groups=", groups, ", expected weight to be divisible by ", + groups, " at dimension 0, but got weight of size ", weight.sizes(), + " instead"); if (!transposed) { - if (input.size(1) != (weight.size(1) * groups)) { - std::stringstream ss; - ss << "Given groups=" << groups << ", weight of size " << weight.sizes() - << ", expected input" << input.sizes() << " to have " - << (weight.size(1) * groups) << " channels, but got " << input.size(1) - << " channels instead"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(0))) { - std::stringstream ss; - ss << "Given weight of size " << weight.sizes() - << ", expected bias to be 1-dimensional with " << weight.size(0) << " elements" - << ", but got bias 
of size " << bias.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(1) == (weight.size(1) * groups), + "Given groups=", groups, ", weight of size ", weight.sizes(), + ", expected input", input.sizes(), " to have ", + (weight.size(1) * groups), " channels, but got ", input.size(1), + " channels instead"); + AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight.size(0)), + "Given weight of size ", weight.sizes(), + ", expected bias to be 1-dimensional with ", weight.size(0), " elements", + ", but got bias of size ", bias.sizes(), " instead"); } else { // transposed - if (input.size(1) != weight.size(0)) { - std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() - << ", expected input" << input.sizes() << " to have " - << weight.size(0) << " channels, but got " << input.size(1) - << " channels instead"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(1) * groups)) { - std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() - << ", expected bias to be 1-dimensional with " << weight.size(1) * groups << " elements" - << ", but got bias of size " << bias.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(1) == weight.size(0), + "Given transposed=", transposed, ", weight of size ", weight.sizes(), + ", expected input", input.sizes(), " to have ", weight.size(0), + " channels, but got ", input.size(1), " channels instead"); + AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight.size(1) * groups), + "Given transposed=", transposed, ", weight of size ", weight.sizes(), + ", expected bias to be 1-dimensional with ", weight.size(1) * groups, " elements", + ", but got bias of size ", bias.sizes(), " instead"); } } static auto view4d(const at::Tensor& tensor) -> at::Tensor { - if (tensor.ndimension() != 3) throw std::runtime_error("expected 3D tensor"); + AT_CHECK(tensor.ndimension() == 3, + "expected 3D tensor, got tensor with ", tensor.ndimension(), + " dimensions instead"); return tensor.unsqueeze(2); } static auto view3d(const at::Tensor& tensor) -> at::Tensor { - if (tensor.ndimension() != 4) throw std::runtime_error("expected 4D tensor"); + AT_CHECK(tensor.ndimension() == 4, + "expected 4D tensor, got tensor with ", tensor.ndimension(), + " dimensions instead"); return tensor.squeeze(2); } @@ -293,7 +274,7 @@ static inline std::vector convolution_expand_param_if_needed( ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " << "dimensions, but got " << param_name << "=" << list_param; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } else { return list_param.vec(); } @@ -311,9 +292,7 @@ at::Tensor _convolution( auto k = weight.ndimension(); int64_t dim = k - 2; - if (dim <= 0) { - throw std::runtime_error("weight should have at least two dimensions"); - } + AT_CHECK(dim > 0, "weight should at least have at least two dimensions"); ConvParams params; params.stride = convolution_expand_param_if_needed(stride_, "stride", dim); @@ -326,8 +305,8 @@ at::Tensor _convolution( params.deterministic = deterministic; params.cudnn_enabled = cudnn_enabled; - if (params.is_padding_neg()) throw std::runtime_error("negative padding is not supported"); - if (params.is_output_padding_neg()) throw std::runtime_error("negative 
output_padding is not supported"); + AT_CHECK(!params.is_padding_neg(), "negative padding is not supported"); + AT_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); check_input_shape_forward(input, weight, bias, params.groups, params.transposed); @@ -349,16 +328,12 @@ at::Tensor _convolution( output = at::thnn_conv_depthwise2d(input, weight, kernel_size, bias, stride, padding, dilation); } else if (params.use_cudnn(input)) { - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); if (params.transposed) { output = at::cudnn_convolution_transpose( @@ -370,16 +345,12 @@ at::Tensor _convolution( params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); } } else if (params.use_miopen(input)) { - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); if (params.transposed) { output = at::miopen_convolution_transpose( @@ -392,16 +363,12 @@ at::Tensor _convolution( } } else if (params.use_mkldnn(input)) { #if AT_MKLDNN_ENABLED() - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.toString() << ") and weight type (" << weight.toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.toString() << ") and bias type (" << bias.toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups); #endif @@ -487,7 +454,7 @@ at::Tensor _convolution_nogroup( } } - throw std::runtime_error("unsupported ConvNd 
parameters"); + AT_ERROR("unsupported ConvNd parameters"); } static Tensor subvariable(const Tensor& var, int dim, int groups, int g) { diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index dad05dcf8b47a8..42ef6a4f6bb5f1 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -22,7 +22,10 @@ // DEFINE_DISPATCH(stub); // // In native/cpu/MyKernel.cpp: -// void kernel(const Tensor& x) { ... } +// namespace { +// // use anonymous namespace so that different cpu versions won't conflict +// void kernel(const Tensor& x) { ... } +// } // REGISTER_DISPATCH(stub, &kernel); // // To call: @@ -46,19 +49,22 @@ enum class CPUCapability { CPUCapability get_cpu_capability(); template -struct AT_API DispatchStub { - static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); +struct AT_API DispatchStub; + +template +struct AT_API DispatchStub { + using FnPtr = rT (*) (Args...); template - void operator()(DeviceType device_type, ArgTypes&&... args) { + rT operator()(DeviceType device_type, ArgTypes&&... args) { if (device_type == DeviceType::CPU) { if (!cpu_dispatch_ptr) { cpu_dispatch_ptr = choose_cpu_impl(); } - (*cpu_dispatch_ptr)(std::forward(args)...); + return (*cpu_dispatch_ptr)(std::forward(args)...); } else if (device_type == DeviceType::CUDA) { AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); - (*cuda_dispatch_ptr)(std::forward(args)...); + return (*cuda_dispatch_ptr)(std::forward(args)...); } else { AT_ERROR("DispatchStub: unsupported device type", device_type); } @@ -103,6 +109,11 @@ struct RegisterDispatch { }; } // anonymous namespace +// Compiler will complain if you put things like std::tuple in +// the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g., +// adding parentheses and using helper struct to get rid of the parentheses, do +// not work with MSVC. So do a `using`-declaration if you need to pass in such +// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. #define DECLARE_DISPATCH(fn, name) \ struct name : DispatchStub {}; \ extern AT_API struct name name diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 67c0877f9fa072..99fa4c701d4bbf 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -67,18 +67,17 @@ Tensor embedding_sparse_backward( int64_t num_features = grad_.size(-1); auto weight_size = std::array{{ num_weights, num_features }}; auto& dense_type = grad.type(); - auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? 
Backend::SparseCUDA : Backend::SparseCPU); // check if all our grad come from padding_idx if (grad.numel() == 0) { - return sparse_type._sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), - dense_type.tensor({0, num_features}), - weight_size); + return at::_sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), + dense_type.tensor({0, num_features}), + weight_size); } auto index = indices.reshape({1, -1}); auto values = grad.reshape({-1, num_features}); - return sparse_type._sparse_coo_tensor_unsafe(index, values, weight_size); + return at::_sparse_coo_tensor_unsafe(index, values, weight_size); } Tensor embedding_dense_backward_cpu( diff --git a/aten/src/ATen/native/Gesv.cpp b/aten/src/ATen/native/Gesv.cpp index b45e2a4f98860e..dcb8a0964d2f90 100644 --- a/aten/src/ATen/native/Gesv.cpp +++ b/aten/src/ATen/native/Gesv.cpp @@ -44,7 +44,7 @@ template<> void lapackGesv( #endif template -static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +static void applyGesv(Tensor& b, Tensor& A, std::vector& infos) { #ifndef USE_LAPACK AT_ERROR("gesv: LAPACK library not found in compilation"); #endif @@ -117,8 +117,7 @@ std::tuple gesv_out( Tensor& solution, Tensor& lu, const Tensor& self, const Tensor& A) { if (self.dim() > 2 || A.dim() > 2) { AT_ERROR("torch.gesv() with the `out` keyword does not support batching. " - "b.dim() (%lld) and A.dim() (%lld) must both be 2.", - (long long)self.dim(), (long long)A.dim()); + "b.dim() (", self.dim(), ") and A.dim() (", A.dim(), ") must both be 2."); } return at::_gesv_single_out(solution, lu, self, A); } diff --git a/aten/src/ATen/native/Gesv.h b/aten/src/ATen/native/Gesv.h index 2d265520f3d21c..a3ba6ec1a8f127 100644 --- a/aten/src/ATen/native/Gesv.h +++ b/aten/src/ATen/native/Gesv.h @@ -5,26 +5,23 @@ namespace at { namespace native { static inline void checkInputs(const Tensor& self, const Tensor& A) { if (A.size(-1) != A.size(-2)) { AT_ERROR("A must be batches of square matrices, " - "but they are %lld by %lld matrices", + "but they are ", A.size(-1), " by ", A.size(-2), " matrices", (long long)A.size(-1), (long long)A.size(-2)); } if (A.size(-1) != self.size(-2)) { AT_ERROR("Incompatible matrix sizes for matmul: each A " - "matrix is %llu by %lld but each b matrix is %lld by %lld.", - (long long)A.size(-1), (long long)A.size(-1), - (long long)self.size(-2), (long long)self.size(-1)); + "matrix is ", A.size(-1), " by ", A.size(-1), + " but each b matrix is ", self.size(-2), " by ", self.size(-1)); } } -static inline void checkErrors(std::vector infos) { +static inline void checkErrors(std::vector& infos) { for (size_t i = 0; i < infos.size(); i++) { auto info = infos[i]; if (info < 0) { - AT_ERROR("gesv: For batch %lld: Argument %lld has illegal value", - (long long)i, -info); + AT_ERROR("gesv: For batch ", i, ": Argument ", -info, " has illegal value."); } else if (info > 0) { - AT_ERROR("gesv: For batch %lld: U(%lld,%lld) is zero, singular U.", - (long long)i, info, info); + AT_ERROR("gesv: For batch ", i, ": U(", info, ",", info, ") is zero, singular U."); } } } diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 1547ab2c934053..4d09307bd640e0 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -3,8 +3,9 @@ #include "ATen/Device.h" #include "ATen/Error.h" #include "ATen/NativeFunctions.h" -#include "ATen/detail/CUDAHooksInterface.h" #include "ATen/native/GridSampler.h" +#include "ATen/native/cpu/GridSamplerKernel.h" +#include "ATen/cpu/vml.h" #ifdef 
_OPENMP #include @@ -16,6 +17,7 @@ using at::native::detail::GridSamplerInterpolation; using at::native::detail::GridSamplerPadding; namespace { + template static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) { return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); @@ -117,121 +119,10 @@ namespace { } } - template - Tensor grid_sampler_2d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - auto output = at::empty({N, C, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sH = output.stride(2); - int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data(); - scalar_t *out_ptr = output.data(); - scalar_t *grid_ptr = grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix = clip_coordinates(ix, inp_W); - iy = clip_coordinates(iy, inp_H); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - ix = reflect_coordinates(ix, inp_W); - iy = reflect_coordinates(iy, inp_H); - } - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - *out_ptr_NCHW = static_cast(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if 
(within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::round(ix)); - int64_t iy_nearest = static_cast(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } - } - } - return output; - } - template Tensor grid_sampler_3d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { int64_t N = input.size(0); int64_t C = input.size(1); int64_t inp_D = input.size(2); @@ -395,167 +286,12 @@ namespace { return output; } - template - std::tuple - grid_sampler_2d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - // If interpolation mode is Nearest, then grad_grid is not filled in the - // loop below. - if (interpolation_mode == GridSamplerInterpolation::Nearest) { - grad_grid.zero_(); - } - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sH = grad_output.stride(2); - int64_t gOut_sW = grad_output.stride(3); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sH = grad_input.stride(2); - int64_t gInp_sW = grad_input.stride(3); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data(); - scalar_t *grid_ptr = grid.data(); - scalar_t *gOut_ptr = grad_output.data(); - scalar_t *gInp_ptr = grad_input.data(); - scalar_t *gGrid_ptr = grad_grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 
2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - // multipliers for gradients on ix and iy - // E.g., 0 for out-of-bound indices when GridSamplerPadding::Border - scalar_t gix_mult, giy_mult; - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix = clip_coordinates_set_grad(ix, inp_W, &gix_mult); - iy = clip_coordinates_set_grad(iy, inp_H, &giy_mult); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - ix = reflect_coordinates_set_grad(ix, inp_W, &gix_mult); - iy = reflect_coordinates_set_grad(iy, inp_H, &giy_mult); - } else { // padding_mode == GridSamplerPadding::Zeros - gix_mult = static_cast(1); - giy_mult = static_cast(1); - } - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw, ix_nw, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne, ix_ne, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw, ix_sw, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se, ix_se, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - - // assuming grad_grid is contiguous - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::round(ix)); - int64_t iy_nearest = static_cast(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t 
*gInp_ptr_NC = gInp_ptr + n * gInp_sN; - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nearest, ix_nearest, gInp_sH, gInp_sW, inp_H, inp_W, *gOut_ptr_NCHW); - } - } - } - } - } - return std::make_tuple(grad_input, grad_grid); - } - template std::tuple grid_sampler_3d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { + const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { auto grad_input = at::zeros_like(input); auto grad_grid = at::empty_like(grid); // If interpolation mode is Nearest, then grad_grid is not filled in the @@ -783,18 +519,18 @@ namespace { } return std::make_tuple(grad_input, grad_grid); } -} + +} // namespace // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler2d_cpu", [&] { - return grid_sampler_2d_cpu_impl( - input, grid, static_cast(interpolation_mode), - static_cast(padding_mode)); - }); + return grid_sampler_2d_cpu_kernel(kCPU, input, grid, interpolation_mode, padding_mode); } +DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel); + + // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { @@ -809,14 +545,11 @@ Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, std::tuple grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu", [&] { - return grid_sampler_2d_backward_cpu_impl( - grad_output, input, grid, - static_cast(interpolation_mode), - static_cast(padding_mode)); - }); + return grid_sampler_2d_backward_cpu_kernel(kCPU, grad_output, input, grid, interpolation_mode, padding_mode); } +DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel); + // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
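// --- Editor's note (illustrative sketch, not part of this patch) --------------
// grid_sampler_2d_cpu above no longer dispatches on scalar type in place; it now
// forwards to a stub registered per CPU capability, following the pattern that
// DispatchStub.h documents earlier in this diff. A minimal sketch of that wiring,
// using hypothetical names (my_fn / my_stub / my_op / my_kernel):
//
//   // native/MyOp.h
//   using my_fn = Tensor (*)(const Tensor& input, const Tensor& grid,
//                            int64_t interpolation_mode, int64_t padding_mode);
//   DECLARE_DISPATCH(my_fn, my_stub);
//
//   // native/MyOp.cpp
//   DEFINE_DISPATCH(my_stub);
//   Tensor my_op(const Tensor& input, const Tensor& grid,
//                int64_t interpolation_mode, int64_t padding_mode) {
//     return my_stub(kCPU, input, grid, interpolation_mode, padding_mode);
//   }
//
//   // native/cpu/MyOpKernel.cpp (compiled once per CPU capability)
//   namespace {  // anonymous namespace so the different CPU builds don't clash
//   Tensor my_kernel(const Tensor& input, const Tensor& grid,
//                    int64_t interpolation_mode, int64_t padding_mode) {
//     return input;  // placeholder body
//   }
//   }  // namespace
//   REGISTER_DISPATCH(my_stub, &my_kernel);
// ------------------------------------------------------------------------------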
std::tuple grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index f39b4e996469fa..ac9c72002c66cc 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -1,3 +1,5 @@ +#pragma once + #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 288fa283abe660..5566fd397320aa 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -39,7 +39,7 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, ss << "The shape of the mask " << mask.sizes() << " at index " << maskIdx; ss << " does not match the shape of the indexed tensor " << self.sizes(); ss << " at index " << idx; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } static void checkIndexTensorTypes(TensorList indices) { @@ -47,9 +47,8 @@ static void checkIndexTensorTypes(TensorList indices) { if (tensor.defined()) { auto& type = tensor.type(); auto scalarType = type.scalarType(); - if (scalarType != kLong && scalarType != kByte) { - throw std::runtime_error("tensors used as indices must be long or byte tensors"); - } + AT_CHECK(scalarType == kLong || scalarType == kByte, + "tensors used as indices must be long or byte tensors"); } } } @@ -146,12 +145,10 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) if (index.numel() != 0) { auto max_idx = index.max().toCLong(); auto min_idx = index.min().toCLong(); - if (max_idx >= dim_size) { - AT_ERROR("index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } - if (min_idx < -dim_size) { - AT_ERROR("index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } + AT_CHECK(max_idx < dim_size, + "index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); + AT_CHECK(min_idx >= -dim_size, + "index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); } return index.remainder(dim_size); } @@ -230,9 +227,8 @@ static std::tuple makeLinearIndex(Tensor self, TensorList orig) } Tensor index(const Tensor & self, TensorList indices) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -240,9 +236,8 @@ Tensor index(const Tensor & self, TensorList indices) { } Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex, expandedValue; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -252,9 +247,8 @@ Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) } Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", 
indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex, expandedValue; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -265,18 +259,14 @@ Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { dim = maybe_wrap_dim(dim, self.dim()); - if (index.dim() >= 2) { - AT_ERROR( - "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); - } + AT_CHECK(index.dim() < 2, + "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + int64_t numIndices = index.numel(); - if (source.dim() == 0 && numIndices != 1) { - AT_ERROR( - "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); - } - if (index.type().scalarType() != ScalarType::Long) { - AT_ERROR("index_copy_(): Expected LongTensor for index"); - } + AT_CHECK(source.dim() != 0 || numIndices == 1, + "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); + AT_CHECK(index.type().scalarType() == ScalarType::Long, + "index_copy_(): Expected LongTensor for index"); // Check that source and destination slices have the same size auto selfSlicedSizes = self.sizes().vec(); @@ -294,7 +284,7 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten ss << "index_copy_(): Source/destination tensor must have same slice shapes. "; ss << "Destination slice shape: " << selfSlicedSizes << " at dimension " << dim; ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } if (source.dim() > 0 && numIndices != source.size(dim)) { AT_ERROR( diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 07d7e46ff79a56..0aaf2149b42a05 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -22,14 +22,6 @@ namespace { // TODO: Maybe the foo_ variants should call th_foo_ -Tensor norm(const Tensor & self, Scalar p) { - if (_has_native(self)) { - return native_norm(self, p); - } else { - return th_norm(self, p); - } -} - Tensor clone(const Tensor& self) { if (_has_native(self)) { return native_clone(self); @@ -144,34 +136,34 @@ Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta Tensor tensor(const Type& dtype) { if (_type_has_native(dtype)) { - return dtype.native_tensor(); + return at::getType(dtype.options()).native_tensor(); } else { - return dtype.th_tensor(); + return at::getType(dtype.options()).th_tensor(); } } Tensor tensor(const Type& dtype, ArrayRef size) { if (_type_has_native(dtype)) { - return dtype.native_tensor(size); + return at::getType(dtype.options()).native_tensor(size); } else { - return dtype.th_tensor(size); + return at::getType(dtype.options()).th_tensor(size); } } Tensor sparse_coo_tensor(const Type& dtype, ArrayRef size) { - return dtype.toSparse().native_sparse_coo_tensor(size); + return at::getType(dtype.options().layout(at::kSparse)).native_sparse_coo_tensor(size); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { - return values.type().toSparse().native_sparse_coo_tensor(indices, values); + return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values); } Tensor sparse_coo_tensor(const Tensor& 
indices, const Tensor& values, ArrayRef size) { - return values.type().toSparse().native_sparse_coo_tensor(indices, values, size); + return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values, size); } Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { - return values.type().toSparse()._native_sparse_coo_tensor_unsafe(indices, values, size); + return at::getType(values.options().layout(at::kSparse))._native_sparse_coo_tensor_unsafe(indices, values, size); } int64_t get_device(const Tensor& self) { diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 2371d82efc6cfb..50726cb99b81b9 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1,7 +1,10 @@ #include "ATen/ATen.h" #include "ATen/ExpandUtils.h" +#include "ATen/Dispatch.h" #include "ATen/NativeFunctions.h" #include "ATen/native/LinearAlgebraUtils.h" +#include "ATen/TensorUtils.h" +#include "ATen/Parallel.h" #include #include #include @@ -169,14 +172,14 @@ Tensor mm(const Tensor& self, const Tensor& mat2) { if (self.is_sparse()) { return mat2.type().addmm(at::zeros({}, mat2.type()), self, mat2, 0, 1); } - return self.type()._mm(self, mat2); + return at::_mm(self, mat2); } Tensor& mm_out(Tensor& result, const Tensor& self, const Tensor& mat2) { if (self.is_sparse()) { - return mat2.type().addmm_out(result, at::zeros({}, mat2.type()), self, mat2, 0, 1); + return at::addmm_out(result, at::zeros({}, mat2.options()), self, mat2, 0, 1); } - return self.type()._mm_out(result, self, mat2); + return at::_mm_out(result, self, mat2); } Tensor mv(const Tensor& self, const Tensor& vec) { @@ -222,6 +225,153 @@ Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const T return at::_addr_out(result, self, vec1, vec2, beta, alpha); } +template +inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, Scalar beta_, Scalar alpha_) { + int64_t bs = result.size(0); + int64_t is = result.size(1); + int64_t js = result.size(2); + int64_t ks = self.size(2); + + scalar_t alpha = alpha_.to(); + scalar_t beta = beta_.to(); + + auto r0 = result.accessor(); + auto s0 = self.accessor(); + auto m0 = mat2.accessor(); + + int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1); + parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) { + for (int64_t b = b_begin; b < b_end; b++) { + auto r1 = r0[b]; + auto s1 = s0[b]; + auto m1 = m0[b]; + for (int64_t i = 0; i < is; i++) { + auto r2 = r1[i]; + auto s2 = s1[i]; + for (int64_t j = 0; j < js; j++) { + scalar_t &r = r2[j]; + if (is_bmm) { + r = 0; + for (int64_t k = 0; k < ks; k++) { + r += s2[k] * m1[k][j]; + } + } else { + r *= beta; + for (int64_t k = 0; k < ks; k++) { + r += alpha * s2[k] * m1[k][j]; + } + } + } + } + } + }); +} + +// This tries to apply some optimizations to bmm/baddbmm: +// - When the operand size is small, computation are parallelized over the batch +// dimension using OMP and naive matrix multiplication is applied. +// - When the operand size is larger than the threshold, if compiled with MKL, MKL's batch gemm is used. +// - Otherwise, we use a series of matrix multiplications. 
+// The threshold of 400 for the first has not been thoroughly benchmarked yet and may have room for further +// optimization, it likely depends on the characteristics of the CPU, MKL will be different from non-MKL etc., +// but this seems to be a first starting point. + +static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha, bool is_bmm_out) { + // is_bmm_out: true for bmm_out, false for baddbmm_ + // self_or_result is "self" for baddbmm_ and "result" for bmm_out + CheckedFrom c = (is_bmm_out ? "bmm" : "baddbmm"); + TensorArg self_arg(self_or_result, is_bmm_out ? "self" : "result", 0); + TensorArg b1_arg(batch1, "batch1", 1); + TensorArg b2_arg(batch2, "batch2", 2); + checkDim(c, b1_arg, 3); + checkDim(c, b2_arg, 3); + + int64_t bs = batch1.size(0); + checkSize(c, b2_arg, 0, bs); + int64_t contraction_size = batch1.size(2); + int64_t res_rows = batch1.size(1); + int64_t res_cols = batch2.size(2); + checkSize(c, b2_arg, 1, contraction_size); + + if (is_bmm_out) { + self_or_result.resize_({bs, res_rows, res_cols}); + } else { + checkSize(c, self_arg, 0, bs); + checkSize(c, self_arg, 1, res_rows); + checkSize(c, self_arg, 2, res_cols); + } + + // handle pathological cases that blas may not like + if (self_or_result.numel() == 0) { + return self_or_result; + } else if (contraction_size == 0) { + return self_or_result.zero_(); + } + + auto batch_items_contiguous_or_transposed = [&](const Tensor& t) { + return (t.stride(2) == 1 && t.stride(1) == t.size(2)) + || (t.stride(1) == 1 && t.stride(2) == t.size(1)); + }; + + if (contraction_size * res_rows * res_cols < 400) { + if (is_bmm_out) { + AT_DISPATCH_ALL_TYPES(batch1.type(), "bmm", [&] { + baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); + }); + } else { + AT_DISPATCH_ALL_TYPES(batch1.type(), "baddbmm", [&] { + baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); + }); + } + } else if (at::hasMKL() && at::native::is_floating_point(self_or_result) + && batch_items_contiguous_or_transposed(batch1) + && batch_items_contiguous_or_transposed(batch2) + && self_or_result.is_contiguous()) { + at::native::_baddbmm_mkl_(self_or_result, batch1, batch2, beta, alpha); + } else { // split along batch dimension + if (is_bmm_out) { + for (int64_t b = 0; b < bs; b++) { + auto r = self_or_result.select(0, b); + at::native::mm_out(r, batch1.select(0, b), batch2.select(0, b)); + } + } else { + for (int64_t b = 0; b < bs; b++) { + self_or_result.select(0, b).addmm_(batch1.select(0, b), batch2.select(0, b), beta, alpha); + } + } + } + return self_or_result; +} + + +Tensor baddbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + Tensor result = self.type().tensor(); + return at::native::baddbmm_out_cpu(result, self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm_out_cpu(Tensor &result, const Tensor& self_, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + Tensor self; + std::tie(self) = expand_size(self_, {batch1.size(0), batch1.size(1), batch2.size(2)}, "baddbmm"); + result.resize_(self.sizes()); + result.copy_(self); + return at::native::baddbmm__cpu(result, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm__cpu(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return bmm_out_or_baddbmm_(self, batch1, batch2, beta, alpha, false); +} + +Tensor bmm_cpu(const Tensor& self, const Tensor& mat2) { + Tensor result = self.type().tensor(); + 
return at::native::bmm_out_cpu(result, self, mat2); +} + +Tensor& bmm_out_cpu(Tensor &result, const Tensor& batch1, const Tensor& batch2) { + Scalar beta(0.0); + Scalar alpha(1.0); + return bmm_out_or_baddbmm_(result, batch1, batch2, beta, alpha, true); +} + Tensor dot(const Tensor& self, const Tensor& tensor) { check_1d(self, "self", "dot"); check_1d(tensor, "tensor", "dot"); diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index ccae5fb75f5b01..b7a9c52c64bd4e 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -287,7 +287,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ for (int64_t c = 0; c < num_labels; c++) { scalar_t& res = grad_a[t][c]; scalar_t lp = log_probs_a[t][c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; + res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; } } // zero the remainder diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 24d8a41fb50271..ed0a94ae496718 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -11,11 +11,15 @@ namespace at { namespace native { namespace { void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ - if (actual != expected){ - std::stringstream ss; - ss << arg_name << " should contain " << expected << " elements not " << actual ; - throw std::runtime_error(ss.str()); + AT_CHECK(actual == expected, + arg_name, " should contain ", expected, " elements not ", actual); + } + + static inline Tensor repeat_if_defined(const Tensor& t, int64_t repeat) { + if (t.defined()) { + return t.repeat(repeat); } + return t; } } @@ -28,12 +32,12 @@ Tensor batch_norm( if (running_mean.defined()) { check_dims_match_num_input_features("running_mean", num_features, running_mean.numel()); } else if (!training) { - throw std::runtime_error("running_mean must be defined in evaluation mode"); + AT_ERROR("running_mean must be defined in evaluation mode"); } if (running_var.defined()) { check_dims_match_num_input_features("running_var", num_features, running_var.numel()); } else if (!training) { - throw std::runtime_error("running_var must be defined in evaluation mode"); + AT_ERROR("running_var must be defined in evaluation mode"); } if (weight.defined()) { check_dims_match_num_input_features("weight", num_features, weight.numel()); @@ -83,35 +87,57 @@ Tensor batch_norm( running_mean, running_var, training, momentum, eps); } +Tensor instance_norm( + const Tensor& input, const Tensor& weight /* optional */, const Tensor& bias /* optional */, + const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, + bool use_input_stats, double momentum, double eps, bool cudnn_enabled) { + AT_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), + "Expected running_mean and running_var to be defined when use_input_stats is false"); + std::vector shape = input.sizes().vec(); + int64_t b = input.size(0); + int64_t c = input.size(1); + shape[1] = b * c; + shape[0] = 1; + + Tensor weight_ = repeat_if_defined(weight, b); + Tensor bias_ = repeat_if_defined(bias, b); + Tensor running_mean_ = repeat_if_defined(running_mean, b); + Tensor running_var_ = repeat_if_defined(running_var, b); + + auto input_reshaped = input.contiguous().view(shape); + auto out = at::batch_norm(input_reshaped, weight_, bias_, running_mean_, running_var_, + use_input_stats, momentum, eps, cudnn_enabled); + + // we alias 
running_mean and running_var because they are const but we want to modify their data + if (running_mean.defined()) { + at::alias(running_mean).copy_(running_mean_.view({ b, c }).mean(0, false)); + } + if (running_var.defined()) { + at::alias(running_var).copy_(running_var_.view({ b, c }).mean(0, false)); + } + + return out.view(input.sizes()); +} + Tensor layer_norm(const Tensor& input, IntList normalized_shape, const Tensor& weight /* optional */, const Tensor& bias /* optional */, double eps, bool cudnn_enabled) { int64_t normalized_ndim = normalized_shape.size(); - if (normalized_ndim < 1) { - std::stringstream ss; - ss << "Expected normalized_shape to be at least 1-dimensional, i.e., " - << "containing at least one element, but got normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } + AT_CHECK(normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape=", + normalized_shape); - if (weight.defined() && !weight.sizes().equals(normalized_shape)) { - std::stringstream ss; - ss << "Expected weight to be of same shape as normalized_shape, but got " - << "weight of shape " << weight.sizes() << " and normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } - - if (bias.defined() && !bias.sizes().equals(normalized_shape)) { - std::stringstream ss; - ss << "Expected bias to be of same shape as normalized_shape, but got " - << "bias of shape " << bias.sizes() << " and normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } + AT_CHECK(!weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", weight.sizes(), " and normalized_shape=", + normalized_shape); + AT_CHECK(!bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", bias.sizes(), " and normalized_shape=", + normalized_shape); auto input_shape = input.sizes(); auto input_ndim = input.dim(); @@ -125,7 +151,7 @@ Tensor layer_norm(const Tensor& input, IntList normalized_shape, ss << ", " << size; } ss << "], but got input of size" << input_shape; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } int64_t n = 1; @@ -159,29 +185,19 @@ Tensor group_norm(const Tensor& input, int64_t num_groups, int64_t b = input.size(0); int64_t c = input.size(1); - if (c % num_groups != 0) { - std::stringstream ss; - ss << "Expected number of channels in input to be divisible by " - << "num_groups, but got input of shape " << input.sizes() << " and " - << "num_groups=" << num_groups; - throw std::runtime_error(ss.str()); - } - - if (weight.defined() && (weight.dim() != 1 || weight.numel() != c)) { - std::stringstream ss; - ss << "Expected weight to be a vector of size equal to the number of " - << "channels in input, but got weight of shape " << weight.sizes() - << " and input of shape " << input.sizes(); - throw std::runtime_error(ss.str()); - } - - if (bias.defined() && (bias.dim() != 1 || bias.numel() != c)) { - std::stringstream ss; - ss << "Expected bias to be a vector of size equal to the number of " - << "channels in input, but got bias of shape " << weight.sizes() - << " and input of shape " << input.sizes(); - throw std::runtime_error(ss.str()); - } + AT_CHECK(c % num_groups == 0, + "Expected number of channels in input to be divisible by ", + "num_groups, but got input of shape ", 
input.sizes(), " and " + "num_groups=", num_groups); + + AT_CHECK(!weight.defined() || (weight.dim() == 1 && weight.numel() == c), + "Expected weight to be a vector of size equal to the number of ", + "channels in input, but got weight of shape ", weight.sizes(), + " and input of shape ", input.sizes()); + AT_CHECK(!bias.defined() || (bias.dim() == 1 && bias.numel() == c), + "Expected bias to be a vector of size equal to the number of ", + "channels in input, but got bias of shape ", weight.sizes(), + " and input of shape ", input.sizes()); // Apply group norm auto input_reshaped = input.contiguous().view({1, b * num_groups, -1}); diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp new file mode 100644 index 00000000000000..1f93ecbc8235ab --- /dev/null +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -0,0 +1,34 @@ +#include "ATen/native/TensorTransformations.h" + +#include +#include + +#include +#include + +namespace at { +namespace native { + +Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { + AT_ASSERTM(self.dim() == 4, + "pixel_shuffle expects 4D input, but got input with sizes ",self.sizes()); + int64_t b = self.size(0); + int64_t c = self.size(1); + int64_t h = self.size(2); + int64_t w = self.size(3); + int64_t upscale_factor_squared = upscale_factor * upscale_factor; + AT_ASSERTM(c % upscale_factor_squared == 0, + "pixel_shuffle expects input channel to be divisible by square of " + "upscale_factor, but got input with sizes ", self.sizes(), + ", upscale_factor=", upscale_factor, + ", and self.size(1)=", c, " is not divisible by ", upscale_factor_squared); + int64_t oc = c / upscale_factor_squared; + int64_t oh = h * upscale_factor; + int64_t ow = w * upscale_factor; + + auto input_reshaped = self.reshape({b, oc, upscale_factor, upscale_factor, h, w}); + return input_reshaped.permute({0 /* b */, 1 /* oc */, 4 /* h */, 2 /* 1st upscale_factor */, 5 /* w */, 3 /* 2nd upscale_factor */}) + .reshape({b, oc, oh, ow}); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 9b06a513a14cd5..df937d6464b487 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -74,10 +74,19 @@ signature. - `std::array` (where N is `1-4`). NB: you MUST NOT put a space after the comma, otherwise this argument will not parse correctly. (If you decide to fix this, make sure you fix the argument parser both in ATen and in PyTorch.) +- `TensorOptions`. Tensor options provide information about how a + tensor should be constructed; it is most useful when you are writing a + factory function, where you have no `Tensor` inputs and thus + cannot otherwise determine how to construct a `Tensor`. - `*` is a special sentinel argument, which doesn't translate into an actual argument, but indicates that in the Python bindings, any subsequent arguments must be specified as keyword arguments (and cannot be provided positionally). +Functions with no tensor inputs are called *factory functions*, and +are handled specially by code generation. If your function is behaving +differently than another example, check first and see if one is a +factory while another is not. 
+ **Return types.** These types are permissible as ReturnType: - `Tensor` and `TensorList`, which translate into the C++ types `Tensor` and `std::vector`, @@ -218,8 +227,9 @@ direct consequences on valid implementations: * Never create a `Tensor` directly (e.g., `at::CPU` or `at::CUDA`), as a caller will be expecting to get `Variable`s out if it passes `Variable`. - Instead, create tensors from the `type()` of one of the input tensors, e.g., - `input.type().tensor()` or `input.type().toScalarType(kByte)` if you need + Instead, create tensors using the `options()` of one of the input + tensors. E.g., `at::empty(sizes, input.options())` or + `at::ones(input.options().dtype(kByte))`, if you need a different scalar type. * If you need to call other ATen functions, be sure to qualify the call diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 2c7e641dcbe843..c976121e77ae3f 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -19,6 +19,7 @@ namespace native { DEFINE_DISPATCH(sum_kernel); DEFINE_DISPATCH(prod_kernel); +DEFINE_DISPATCH(norm_kernel); static inline Tensor integer_upcast(const Tensor& self, optional dtype) { ScalarType scalarType = self.type().scalarType(); @@ -584,12 +585,23 @@ Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim) return reduce_multi_associative_out<_sum, _sum_out>(result, self, dims, keepdim); } -Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); - return at::native::norm_out(result, self, p, dim, keepdim); +Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) + return result; + if (self.is_contiguous() && result.is_contiguous()) { + _dimreduce_setup(result, self, dim); + norm_kernel(kCPU, result, self, p, dim); + if (!keepdim) { + result.squeeze_(dim); + } + return result; + } else { + return at::_th_norm_out(result, self, p, dim, keepdim); + } } -Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { +Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); @@ -597,10 +609,44 @@ Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { return result; } else { - return at::_th_norm_out(result, self, p, dim, keepdim); + if (self.is_cuda()) { + return at::_th_norm_out(result, self, p, dim, keepdim); + } else { + return _norm_out_cpu(result, self, p, dim, keepdim); + } + } +} + +Tensor _norm(const Tensor &self, Scalar p) { + if (self.type().is_sparse()) { + return at::native_norm(self, p); + } else { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); + if (self.is_cuda()) { + return at::th_norm(self, p); + } else { + if (self.is_contiguous()) { + Tensor result = 
CPU(kFloat).scalarTensor(0).toType(self.type()); + norm_kernel(kCPU, result, self, p, nullopt); + return result; + } else { + return at::th_norm(self, p); + } + } } } +Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::norm_out(result, self, p, dim, keepdim); +} + +Tensor norm(const Tensor& self, Scalar p) { + return at::native::_norm(self, p); +} + Tensor all(const Tensor& self, int64_t dim, bool keepdim) { Tensor result = self.type().tensor(); return at::native::all_out(result, self, dim, keepdim); diff --git a/aten/src/ATen/native/RoiPooling.cpp b/aten/src/ATen/native/RoiPooling.cpp index 5995e43ef1e536..1a089a9f473c17 100644 --- a/aten/src/ATen/native/RoiPooling.cpp +++ b/aten/src/ATen/native/RoiPooling.cpp @@ -134,7 +134,7 @@ Tensor RoiPooling2d_backward_cpu( double spatialScale, const Tensor& gradOutput, const Tensor& argmaxes) { - throw std::runtime_error("not implemented"); + AT_ERROR("not implemented"); } } diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 6ffb891bf7d777..975ae8c1ff9c47 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -18,7 +18,7 @@ Scalar _local_scalar(const Tensor& self) { Scalar _local_scalar_dense_cpu(const Tensor& self) { Scalar r; - AT_DISPATCH_ALL_TYPES_AND_HALF( + AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX( self.type(), "_local_scalar_dense_cpu", [&] { scalar_t value = *self.data(); r = Scalar(value); diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h index 7518d1f945a5fd..875c7aa12b68cf 100644 --- a/aten/src/ATen/native/SpectralOpsUtils.h +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -51,7 +51,7 @@ inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size, std::ostringstream ss; ss << "expected real signal size " << expected_size << " is incompatible " << "with onesided complex frequency size " << complex_size; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp new file mode 100644 index 00000000000000..9605736ee112fe --- /dev/null +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -0,0 +1,51 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +namespace at { +namespace native { + +static void ensure_has_index(Device* device) { + if (!device->is_cuda() || device->has_index()) { + return; + } + device->set_index(at::current_device()); +} + +static Tensor to_impl(const Tensor& self, const TensorOptions& options, bool non_blocking) { + return self.type().toBackend(options.backend()).toScalarType(options.dtype()) + .copy(self, non_blocking, options.device()); +} + +Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking) { + ensure_has_index(&device); + if (self.device() == device && self.dtype() == dtype) { + return self; + } + return to_impl(self, self.options().device(device).dtype(dtype), non_blocking); +} + +Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking) { + if (self.dtype() == dtype) { + return self; + } + return to_impl(self, self.options().dtype(dtype), non_blocking); +} + +Tensor to(const Tensor& self, Device device, bool non_blocking) { + ensure_has_index(&device); + if (self.device() == device) { + return self; + } + return to_impl(self, self.options().device(device), non_blocking); +} + +Tensor to(const Tensor& self, const Tensor& other, bool non_blocking) { + auto 
self_options = self.options(); + auto options = other.options(); + if (self_options == options) { + return self; + } + return to_impl(self, options, non_blocking); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 1a12549b5e70e9..178045d9fd0de4 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -59,6 +59,10 @@ void window_function_checks( window_length); } +const TypeExtendedInterface& getFactoryType(const TensorOptions& options) { + return at::getType(options); +} + } // namespace // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arange ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,7 +77,7 @@ Tensor arange( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._arange(start, end, step); + return getFactoryType(options)._arange(start, end, step); } Tensor& arange_out(Tensor& result, Scalar start, Scalar end) { @@ -86,7 +90,7 @@ Tensor& arange_out(Tensor& result, Scalar start, Scalar end, Scalar step) { Tensor arange(Scalar end, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._arange(end); + return getFactoryType(options)._arange(end); } Tensor& arange_out(Tensor& result, Scalar end) { @@ -94,7 +98,7 @@ Tensor& arange_out(Tensor& result, Scalar end) { } Tensor _dim_arange(const Tensor& like, int64_t dim) { - return like.type().toScalarType(at::kLong)._arange(like.size(dim)); + return at::getType(like.options().dtype(at::kLong))._arange(like.size(dim)); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -102,7 +106,7 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) { Tensor empty(IntList size, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] // Can't call a factory function, because the buck stops with us! 
- return at::getType(options).tensor(size); + return getFactoryType(options).tensor(size); } Tensor& empty_out(Tensor& result, IntList size) { @@ -218,7 +222,7 @@ Tensor linspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._linspace(start, end, steps); + return getFactoryType(options)._linspace(start, end, steps); } Tensor& linspace_out(Tensor& result, Scalar start, Scalar end) { @@ -241,7 +245,7 @@ Tensor logspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._logspace(start, end, steps); + return getFactoryType(options)._logspace(start, end, steps); } Tensor& logspace_out(Tensor& result, Scalar start, Scalar end) { @@ -475,7 +479,7 @@ Tensor range( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._range(start, end, step); + return getFactoryType(options)._range(start, end, step); } Tensor& range_out(Tensor& result, Scalar start, Scalar end) { diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index bae2a94b86273b..15d86fb5162a3e 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -98,7 +98,8 @@ void TensorIterator::compute_common_type() { if (op.tensor->defined() && type != op.tensor->type()) { if (op.tensor->dim() == 0) { if (type.backend() != at::Backend::CUDA) { - *op.tensor = op.tensor->toType(type); + cast_tensors_.emplace_back(op.tensor->toType(type)); + op.tensor = &(cast_tensors_.back()); } } else { op.needs_cast = true; diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 245866373d4763..3faedbec6bb320 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -184,6 +184,7 @@ struct AT_API TensorIterator { DimVector shape_; DimVector perm_; SmallVector operands_; + SmallVector cast_tensors_; int num_outputs_ = 0; bool has_coalesced_dimensions_ = false; }; diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index cf6a9ece5c0d9d..634e7a443d21fd 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -20,9 +20,8 @@ std::vector broadcast_tensors(TensorList tensors) { static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; - if (t.dim() == 0) { - AT_ERROR("zero-dimensional tensor (at position ", i, ") cannot be concatenated"); - } + AT_CHECK(t.dim() > 0, + "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); } } @@ -39,12 +38,11 @@ Tensor cat(TensorList tensors, int64_t dim) { } std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { - if (self.dim() == 0) { - AT_ERROR("chunk expects at least a 1-dimensional tensor"); - } - if (chunks <= 0) { - AT_ERROR("chunk expects `chunks` to be greater than 0, got: ", chunks); - } + AT_CHECK(self.dim() > 0, + "chunk expects at least a 1-dimensional tensor"); + AT_CHECK(chunks > 0, + "chunk expects `chunks` to be greater than 0, got: ", chunks); + int64_t split_size = (self.size(dim) + chunks - 1) / chunks; // We need to call split_with_sizes in the case where split_size and dimension size are 0, because @@ -117,14 +115,11 @@ Tensor expand(const Tensor& self, IntList size, bool implicit) { // distinguish between expands inserted by 
broadcasts and those explicitly // requested by the user, because it is legal to remove implicit expands // from the graph, but not legal to remove the explicit ones. - if (size.size() < (size_t)self.dim()) { - std::ostringstream ss; - ss << "expand(" << self.type() << "{" << self.sizes() << "}, size=" << size - << "): the number of sizes provided (" << size.size() << ") " - << "must be greater or equal to the number of dimensions in the tensor (" - << self.dim() << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(size.size() >= (size_t)self.dim(), + "expand(", self.type(), "{", self.sizes(), "}, size=", size, + "): the number of sizes provided (", size.size(), ") ", + "must be greater or equal to the number of dimensions in the tensor (", + self.dim(), ")"); std::vector expandedSizes; std::vector expandedStrides; @@ -159,17 +154,15 @@ Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { if (start != cur_size) { // start being the end is valid, but not a valid dim specification. start = maybe_wrap_dim(start, cur_size); } - if (length < 0 || start > cur_size - length) { - AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); - } + AT_CHECK(length >= 0 && start <= cur_size - length, + "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); return at::slice(self, dim, start, start + length, 1); } Tensor permute(const Tensor& self, IntList dims) { auto nDims = self.dim(); - if (dims.size() != (size_t)nDims) { - AT_ERROR("number of dims don't match in permute"); - } + AT_CHECK(dims.size() == (size_t)nDims, + "number of dims don't match in permute"); auto oldSizes = self.sizes(); auto oldStrides = self.strides(); std::vector newSizes(nDims); @@ -177,9 +170,8 @@ Tensor permute(const Tensor& self, IntList dims) { std::vector seen(nDims); for (int64_t i = 0; i < nDims; i++) { auto dim = maybe_wrap_dim(dims[i], nDims); - if (seen[dim]) { - AT_ERROR("repeated dim in permute"); - } + AT_CHECK(!seen[dim], + "repeated dim in permute"); seen[dim] = true; newSizes[i] = oldSizes[dim]; newStrides[i] = oldStrides[dim]; @@ -188,9 +180,8 @@ Tensor permute(const Tensor& self, IntList dims) { } Tensor repeat(const Tensor& self, IntList repeats) { - if (repeats.size() < (size_t)self.dim()) { - AT_ERROR("Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); - } + AT_CHECK(repeats.size() >= (size_t)self.dim(), + "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); // Add new leading dimensions to the tensor if the // number of target dimensions is larger than the @@ -206,7 +197,7 @@ Tensor repeat(const Tensor& self, IntList repeats) { Tensor xtensor = self.expand(padded_size); Tensor result = self.type().tensor(target_size); - Tensor urtensor = result.type().alias(result); + Tensor urtensor = at::alias(result); for (int64_t i = 0; i < xtensor.dim(); ++i) { // can't unfold with step 0, so make sure step is at least 1 // (it doesn't matter what it is in that case, because the size is 0). 
@@ -238,12 +229,9 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) { AT_CHECK(ndim > 0, "select() cannot be applied to a 0-dim tensor."); dim = maybe_wrap_dim(dim, ndim); auto size = self.size(dim); - if (index < -size || index >= size) { - std::stringstream ss; - ss << "select(): index " << index << " out of range for tensor of size "; - ss << self.sizes() << " at dimension " << dim; - throw std::runtime_error(ss.str()); - } + AT_CHECK(index >= -size && index < size, + "select(): index ", index, " out of range for tensor of size ", + self.sizes(), " at dimension ", dim); if (index < 0) { index += size; } @@ -261,10 +249,8 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ dim = maybe_wrap_dim(dim, ndim); auto sizes = self.sizes().vec(); auto strides = self.strides().vec(); - if (step <= 0) { - // TODO: support negative strides - throw std::runtime_error("slice step must be positive"); - } + // TODO: support negative strides + AT_CHECK(step > 0, "slice step must be positive"); if (start < 0) { start += sizes[dim]; } @@ -322,22 +308,15 @@ std::vector split_with_sizes(const Tensor& self, IntList split_sizes, in for (i = 0; i < num_splits; ++i) { auto length = split_sizes[i]; - if (length < 0) { - std::ostringstream ss; - ss << "split_with_sizes expects split_sizes have only non-negative " - << "entries, but got split_sizes=" << split_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(length >= 0, + "split_with_sizes expects split_sizes have only non-negative ", + "entries, but got split_sizes=", split_sizes); splits[i] = self.narrow(dim, start_idx, length); start_idx += length; } - if (start_idx != dim_size) { - std::ostringstream ss; - ss << "split_with_sizes expects split_sizes to sum exactly to " - << dim_size << " (input tensor's size at dimension " << dim << "), " - << "but got split_sizes=" << split_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(start_idx == dim_size, + "split_with_sizes expects split_sizes to sum exactly to ", dim_size, + " (input tensor's size at dimension ", dim, "), ", "but got split_sizes=", split_sizes); return splits; } @@ -350,28 +329,24 @@ static inline std::vector get_stack_inputs(TensorList tensors, int64_t d } Tensor stack(TensorList tensors, int64_t dim) { - if (tensors.size() == 0) { - throw std::runtime_error("stack expects a non-empty TensorList"); - } + AT_CHECK(tensors.size() > 0, + "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat(get_stack_inputs(tensors, dim), dim); } Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { - if (tensors.size() == 0) { - throw std::runtime_error("stack expects a non-empty TensorList"); - } + AT_CHECK(tensors.size() > 0, + "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat_out(result, get_stack_inputs(tensors, dim), dim); } static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { int64_t nsparseDims = self._sparseDims(); - if (dim0 >= nsparseDims || dim1 >= nsparseDims) { - AT_ERROR( - "sparse transpose: transposed dimensions must be sparse ", - "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); - } + AT_CHECK(dim0 < nsparseDims && dim1 < nsparseDims, + "sparse transpose: transposed dimensions must be sparse ", + "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); if (self._indices().numel() == 0 && self._values().numel() == 0) { auto sizes = self.sizes().vec(); @@ 
-442,10 +417,9 @@ static void check_t(const Tensor& self, const char *fn) { if (self.is_sparse()) { int64_t sparseDims = self._sparseDims(); int64_t denseDims = self._denseDims(); - if (!(sparseDims == 2 && denseDims == 0)) { - AT_ERROR(fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", - sparseDims, " sparse and ", denseDims, " dense dimensions"); - } + AT_CHECK(sparseDims == 2 && denseDims == 0, + fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", + sparseDims, " sparse and ", denseDims, " dense dimensions"); } else if (self.dim() != 2) { AT_ERROR(fn, " expects a 2D tensor, but self is ", self.dim(), "D"); } @@ -622,6 +596,10 @@ std::vector meshgrid(TensorList tensors) { AT_ERROR("Expected scalar or 1D tensor in the tensor list but got: ", tensors[i]); } } + for(int64_t i = 0; i < size - 1; i++){ + AT_CHECK(tensors[i].dtype() == tensors[i+1].dtype(), "meshgrid expects all tensors to have the same dtype"); + AT_CHECK(tensors[i].device() == tensors[i+1].device(), "meshgrid expects all tensors to have the same device"); + } std::vector grids; for(int64_t i = 0; i < size; i++) { std::vector view_shape(size, 1); diff --git a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp new file mode 100644 index 00000000000000..1627b4c2596e12 --- /dev/null +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -0,0 +1,117 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace at { +namespace native { + +// Staying faithful to the Python for now for clarity, look for optimizations later +// (e.g., single return statement for RVO) +Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim) +{ + // I assume tensor.contiguous(), view(), norm(), etc. here will dispatch through VariableType. + if (dim == -1) { + return v.norm(pow); + } else if (dim == 0) { + std::vector output_size(v.dim(), 1); + output_size[0] = v.size(0); + return v.contiguous().view({v.size(0), -1}).norm(pow, 1).view(output_size); + } else if (dim == v.dim() - 1) { + std::vector output_size(v.dim(), 1); + output_size[v.dim() - 1] = v.size(v.dim() - 1); + return v.contiguous().view({-1, v.size(v.dim() - 1)}).norm(pow, 0).view(output_size); + } else { + // To consider: at::native::norm_except_dim is probably fine as well, + // and would avoid an additional dynamic dispatch. + return at::norm_except_dim(v.transpose(0, dim), pow, 0).transpose(0, dim); // optimize? + } +} + +Tensor _weight_norm + (const Tensor & v_in, + const Tensor & g_in, + int64_t dim) +{ + + AT_CHECK( + v_in.device() == g_in.device(), + "weight_norm: expected v_in and g_in to be on the same device, but v_in is " + "on ", v_in.device(), " and g_in is on ", g_in.device()); + + auto v = v_in.contiguous(); + auto g = g_in.contiguous(); + + bool can_use_fused = v.type().is_cuda() && (dim == 0 || dim == v.dim() - 1); + + if (can_use_fused) { + // weight_norm does not have a derivative defined for it, so this will route back through + // VariableType.cpp, and construct a WeightNormFusedBackward object in the autograd graph. + return std::get<0>(at::_weight_norm_cuda_interface(v, g, dim)); + } else { + // Double-differentiable primitive ops + // at::native::norm_except_dim would probably be fine as well. + return v*(g/at::norm_except_dim(v, 2, dim)); + } +} + +// Differentiable backward path, an alternative to weight_norm_cuda_backward, to be used +// when backward is itself creating a graph. 
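// For reference, a sketch of the gradients this path computes (derived from the
// definition used above, w = g * v / ||v|| taken over each slice along `dim`):
// with s = sum_over_slice(grad_w * v),
//   grad_g = s / ||v||
//   grad_v = (g / ||v||) * (grad_w - v * s / ||v||^2),
// which is what the per_dim_sums / norms expressions below evaluate.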
+// The GradMode::is_enabled() check must be performed within Functions.cpp; that's why we +// define a separate function here, instead of inlining it in weight_norm_cuda_backward. +std::tuple _weight_norm_differentiable_backward + (const Tensor & grad_w, + const Tensor & saved_v, + const Tensor & saved_g, + const Tensor & saved_norms, + int64_t dim) +{ + // In Functions.cpp, the HardshrinkBackward object supplies "grad.contiguous()" + // as the first argument, so grad_w should be contiguous here. + // All these checks should succeed: + AT_CHECK(grad_w.is_contiguous(), "grad_w must be contiguous"); + AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + + int64_t last_dim = saved_v.dim() - 1; + int64_t last_size = saved_v.size(last_dim); + + // Like weight_norm_fused_backward, weight_norm_differentiable_backward should only ever be called + // through a WeightNormFusedBackward object, so we expect that dim == 0 || dim == saved_v.size(-1) + AT_CHECK(dim == 0 || dim == last_dim, "Expected dim to be the first or last dimension"); + + // saved_g and saved_norms are already shaped to broadcast over the correct dimensions + + // ...but saved_norms might be Float when saved_g and saved_v are half. + // To consider: saved_norms.to(..., True /*non_blocking*/); + auto norms = saved_norms.to(saved_g.type().scalarType()); + + std::vector bcast_size(saved_v.dim(), 1); + + // Analytic backward path using differentiable primitive ops + if (dim == 0) { + bcast_size[0] = saved_v.size(0); + auto per_dim_sums = (grad_w*saved_v).view({saved_v.size(0), -1}).sum(1).view(bcast_size); + auto grad_v = (saved_g/norms)*(grad_w - saved_v*(per_dim_sums/(norms*norms))); + auto grad_g = per_dim_sums/norms; + return std::tuple{grad_v, grad_g}; + } else { // dim == last_dim + bcast_size[last_dim] = last_size; + auto per_dim_sums = (grad_w*saved_v).view({-1, last_size}).sum(0).view(bcast_size); + auto grad_v = (saved_g/norms)*(grad_w - saved_v*(per_dim_sums/(norms*norms))); + auto grad_g = per_dim_sums/norms; + return std::tuple{grad_v, grad_g}; + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp new file mode 100644 index 00000000000000..648defd192e117 --- /dev/null +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -0,0 +1,890 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include +#include +#include + +namespace at { namespace native { namespace { + +/** NOTE [ Grid Sample CPU Kernels ] + * + * Implementation of vectorized grid sample CPU kernels is divided into three + * parts. More detailed description exist after this paragraph, but on a high + * level, they are + * 1. `ComputeLocation` struct + * + Computes the interpolation location basing on padding mode. + * 2. `ApplyGridSample` struct + * + Owns N (# spatial dims) `ComputeLocation` structs, and uses them to + * compute the interpolation locations. + * + Interpolates the values and writes to output. + * 3. `grid_sample_2d_grid_slice_iterator` function + * + Iterates over a slice of the grid tensor based on the geometry by the + * spatial ordering, i.e., the first iteration will process grid values + * grid[n, 0, 0, :], grid[n, 0, 1, :], grid[n, 0, 2, :], ... 
+ * (Recall that, e.g., 2D grid has shape [N x H x W x 2], so grid[n, ...] + * is a slice, and grid[n, h, w, :] contains the values for a single + * output spatial location.) + * + Applies a given operator at each iteration, so we can use the same + * pattern for forward and backward. + * + * Putting everything together, we have, e.g., the forward kernel implemented + * as + * + * // `ApplyGridSample` struct that processes grid values, extracts and + * // interpolates input values, and write to output. + * ApplyGridSample grid_sample(input_accessor); + * + * // For each slice, we call `grid_sample_2d_grid_slice_iterator` with + * // 1. the grid slice, and + * // 2. a lambda that takes in + * // i. location vectors (x and y for 2D) extracted from grid + * // ii. `spatial_offset` as the spatial offset of these vectors + * // from the beginning of this slice. + * // iii. `len` as the number of valid locations in the vectors. + * // (There might not be enough near boundary.) + * for (int n = 0; n < input_accessor.size(0); n++) { + * grid_sample_2d_grid_slice_iterator( + * grid_accessor[n], + * [&](const Vec256& grid_x, + * const Vec256& grid_y, + * int64_t spatial_offset, int64_t len) { + * grid_sample.forward(out_accessor[n], input_accessor[n], + * spatial_offset, grid_x, grid_y, len); + * }); + * } + * + * Now we talk about details of each of these three parts: + * + * 1. `ComputeLocation` struct + * Transforms grid values into interpolation locations of the input tensor + * for a particular spatial dimension, based on the size of that dimension + * in input tensor, and the padding mode. + * + * template + * struct ComputeLocation { + * using Vec = Vec256; + * + * // ctor + * ComputeLocation(int64_t size); + * + * // Given grid values `in`, return the interpolation locations after + * // un-normalization and padding mechanism (elementwise). + * Vec apply(const Vec &in) const; + * + * // Similar to `apply`, but also returns `d apply(in) / d in` + * // (elementwise). + * // this is often used in gradient computation. + * std::pair apply_get_grad(const Vec &in) const; + * }; + * + * 2. `ApplyGridSample` struct + * Owns N `ComputeLocation` structs, where N is the number of spatial + * dimensions. Given N input grid vectors (one for each spatial dimension) + * and spatial offset, it gets the interpolation locations from + * `ComputeLocation`s, applies interpolation procedure, and then writes to + * the output (or grad_input & grad_grid in backward). + * + * template + * struct ApplyGridSample { + * + * // ctor + * ApplyGridSample(const TensorAccessor& input); + * + * // Applies grid sampling (forward) procedure: + * // 1. computes interpolation locations from grid values `grid_x` + * // and `grid_y`, + * // 2. interpolates output values using the locations and input + * // data in `inp_slice`, and + * // 3. writes the first `len` values in the interpolated vector to + * // `out_slice` with spatial offset being `offset`. + * // + * // This assimes that `grid_x` and `grid_y` all contain valid grid + * // values \in [-1, 1], even at indices greater than `len`. + * // + * // The `*_slice` argument namess mean samples within a batch (i.e., + * // with the batch dimension sliced out). + * void forward(TensorAccessor& out_slice, + * const TensorAccessor& inp_slice, + * int64_t offset, const Vec& grid_x, const Vec& grid_y, + * int64_t len) const; + * + * // Applies grid sampling (backward) procedure. Arguments semantics + * // and strategy are similar to those of `forward`. 
+ * void backward(TensorAccessor& gInp_slice, + * TensorAccessor& gGrid_slice, + * const TensorAccessor& gOut_slice, + * const TensorAccessor& inp_slice, + * int64_t offset, const Vec& grid_x, const Vec& grid_y, + * int64_t len) const; + * }; + * + * 3. `grid_sample_2d_grid_slice_iterator` function + * Among the tensors we work with, we know that the output tensors are + * contiguous (i.e., `output` in forward, and `grad_input` & `grad_grid` in + * backward), we need to randomly read `input` anyways, and `grad_output` + * usually comes from autograd and is often contiguous. So we base our + * iterating strategy on the geometry of grid. + * `grid_sample_2d_grid_slice_iterator` function provides an abstraction to + * efficiently iterate through a `grid` slice (without batch dimension). + * See comments of that function on the specific cases and strategies used. + * + * template + * void grid_sample_2d_grid_slice_iterator( + * const TensorAccessor& grid_slice, + * const ApplyFn &apply_fn); + * + * `apply_fn` is a function/lambda that takes in + * i. location vectors (x and y for 2D) extracted from grid + * ii. `spatial_offset` as the spatial offset of these vectors + * from the beginning of this slice. + * iii. `len` as the number of valid locations in the vectors. + * (There might not be enough near boundary.) + + * It should be callable as if it has declaration: + * void apply_fn(const Vec256& grid_x, + * const Vec256& grid_y, + * int64_t spatial_offset, int64_t len); + * + * `apply_fn` will be called multiple times, and together cover the entire + * output spatial space. + * + * Now you should be able to understand everything about the implementation of + * 2D forward kernel shown at the beginning of this note. + * + **/ + + +using at::native::detail::GridSamplerInterpolation; +using at::native::detail::GridSamplerPadding; +using namespace at::vec256; + + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ComputeLocation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Struct to compute interpolation location from grid values, and to apply +// padding mechanism (e.g., reflection). +// See NOTE [ Grid Sample CPU Kernels ] for details.
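// A concrete reading of the unnormalization below (illustrative numbers only):
// ComputeLocationBase::unnormalize() maps a grid value x in [-1, 1] to an input
// coordinate (x + 1) * (size - 1) / 2, so with size = 5: x = -1 -> 0, x = 0 -> 2,
// x = 1 -> 4. The padding specializations then clamp (Border) or fold back
// (Reflection) coordinates falling outside [0, size - 1], while Zeros leaves
// them as-is and relies on the in-bound masks computed in ApplyGridSample.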
+ +template +struct ComputeLocationBase { + using Vec = Vec256; + + const scalar_t half_max_val; + + ComputeLocationBase(int64_t size) + : half_max_val(static_cast(size - 1) / 2) {} + + inline Vec unnormalize(const Vec &in) const { + return (in + Vec(1)) * Vec(half_max_val); + } +}; + +template +struct ComputeLocation; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + using ComputeLocationBase::ComputeLocationBase; + + inline Vec apply(const Vec &in) const { + return unnormalize(in); + } + + inline std::pair apply_get_grad(const Vec &in) const { + return std::make_pair(unnormalize(in), Vec(half_max_val)); + } +}; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + const scalar_t max_val; + + ComputeLocation(int64_t size) + : ComputeLocationBase(size) + , max_val(static_cast(size - 1)) {} + + inline Vec apply(const Vec &in) const { + return min(Vec(max_val), max(unnormalize(in), Vec(0))); + } + inline std::pair apply_get_grad(const Vec &in) const { + using int_t = int_same_size_t; + Vec max_val_vec(max_val), zeros(0); + auto indices = unnormalize(in); + auto bounded_lo = max(indices, zeros); + // Integral type equality comparison is very very fast because it just looks + // at the bits. Casting is free too. So we use the following pattern instead + // of comparison + blendv. + auto in_bound_lo = cast(cast(bounded_lo) == cast(indices)); + auto res = min(bounded_lo, max_val_vec); + auto in_bound_hi = cast(cast(res) == cast(indices)); + return std::make_pair(res, (in_bound_lo & in_bound_hi) & Vec(half_max_val)); + } +}; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + bool unit_size; // whether size == 1, just return 0 in this case + const scalar_t double_max_val; + const scalar_t neg_half_max_val; + + ComputeLocation(int64_t size) + : ComputeLocationBase(size) + , unit_size(size == 1) + , double_max_val(static_cast((size - 1) * 2)) + , neg_half_max_val(-0.5 * static_cast(size - 1)) {} + + inline Vec apply(const Vec &in) const { + if (unit_size) { + return Vec(0); + } + Vec double_max_val_vec(double_max_val); + auto abs_in = unnormalize(in).abs(); + auto fdouble_flips = abs_in / double_max_val_vec; + auto double_flips = fdouble_flips.trunc(); + auto extra = abs_in - double_flips * double_max_val_vec; + // Now we need to test if extra > max_val to find out if another flip is + // needed. The following comparison does that and returns the correct + // flipped value. 
+ return min(extra, double_max_val_vec - extra); + } + + inline std::pair apply_get_grad(const Vec &in) const { + if (unit_size) { + return std::make_pair(Vec(0), Vec(0)); + } + Vec double_max_val_vec(double_max_val); + auto unnorm_in = unnormalize(in); + auto neg_in = unnorm_in < Vec(0); + auto abs_in = unnorm_in.abs(); + auto fdouble_flips = abs_in / double_max_val_vec; + auto double_flips = fdouble_flips.trunc(); + + auto extra = abs_in - double_flips * double_max_val_vec; + auto reflected_extra = double_max_val_vec - extra; + auto one_more_flip = extra > reflected_extra; + + return std::make_pair( + Vec::blendv(extra, reflected_extra, one_more_flip), + Vec::blendv(Vec(half_max_val), Vec(neg_half_max_val), one_more_flip ^ neg_in) + ); + } +}; + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ApplyGridSample ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Struct to apply grid sample (reading from input, interpolate, and write to +// output). +// See NOTE [ Grid Sample CPU Kernels ] for details. + +template +static inline void +mask_scatter_add(const scalar_t *src, scalar_t* base_addr, + const int_same_size_t *offsets, + const int_same_size_t *mask, int64_t len) { + #pragma unroll + for (int64_t i = 0; i < len; i++) { + if (mask[i] & 0x01) { + base_addr[offsets[i]] += src[i]; + } + } +} + +template +struct ApplyGridSample; + +template +struct ApplyGridSample { + using Vec = Vec256; + using integer_t = int_same_size_t; + using iVec = Vec256; + + const int64_t inp_H; + const int64_t inp_W; + const int64_t inp_sH; + const int64_t inp_sW; + const int64_t C; + const int64_t inp_sC; + const ComputeLocation compute_H; + const ComputeLocation compute_W; + const bool must_in_bound = padding != GridSamplerPadding::Zeros; + + ApplyGridSample(const TensorAccessor& input) + : inp_H(input.size(2)) + , inp_W(input.size(3)) + , inp_sH(input.stride(2)) + , inp_sW(input.stride(3)) + , C(input.size(1)) + , inp_sC(input.stride(1)) + , compute_H(input.size(2)) + , compute_W(input.size(3)) {} + + inline std::tuple< + Vec, Vec, Vec, Vec, // distances to 4 sides + Vec, Vec, Vec, Vec, // interpolation weights wrt 4 corners + Vec, Vec, Vec, Vec, // in_bound masks + iVec, iVec // y_n and x_w + > + compute_interp_params(const Vec& x, const Vec& y) const { + // get NE, NW, SE, SW pixel values from (x, y) + // assuming we get exact integer representation and just use scalar_t + // if we don't, the weights will be garbage anyways. + auto x_w = x.floor(); + auto y_n = y.floor(); + + // get distances to each side + auto w = x - x_w; + auto e = Vec(1) - w; + auto n = y - y_n; + auto s = Vec(1) - n; + + // get interpolation weights for each neighbor + // e.g., for the nw corder, the weight is `dist_to_south * dist_to_east`. + auto nw = s * e; + auto ne = s * w; + auto sw = n * e; + auto se = n * w; + + auto i_x_w = convert_to_int_of_same_size(x_w); + auto i_y_n = convert_to_int_of_same_size(y_n); + auto i_x_e = i_x_w + iVec(1); + auto i_y_s = i_y_n + iVec(1); + + // Use int comparison because it is much faster than float comp with AVX2 + // (latency 1 cyc vs. 4 cyc on skylake) + // Avoid using the le and ge because those are not implemented in AVX2 and + // are actually simulated using multiple instructions. + auto w_mask = must_in_bound ? iVec(-1) // true = all ones + : (i_x_w > iVec(-1)) & (i_x_w < iVec(inp_W)); + auto n_mask = must_in_bound ? iVec(-1) // true = all ones + : (i_y_n > iVec(-1)) & (i_y_n < iVec(inp_H)); + auto e_mask = must_in_bound ? 
(i_x_e < iVec(inp_W)) + : (i_x_e > iVec(-1)) & (i_x_e < iVec(inp_W)); + auto s_mask = must_in_bound ? (i_y_s < iVec(inp_H)) + : (i_y_s > iVec(-1)) & (i_y_s < iVec(inp_H)); + auto nw_mask = cast(must_in_bound ? iVec(-1) : (w_mask & n_mask)); + auto ne_mask = cast(e_mask & n_mask); + auto sw_mask = cast(w_mask & s_mask); + auto se_mask = cast(e_mask & s_mask); + + return std::make_tuple( + n, s, w, e, + nw, ne, sw, se, + nw_mask, ne_mask, sw_mask, se_mask, + i_y_n, i_x_w); + } + + inline void forward(TensorAccessor& out_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto interp_params = compute_interp_params(x, y); + + auto nw = std::get<4>(interp_params); + auto ne = std::get<5>(interp_params); + auto sw = std::get<6>(interp_params); + auto se = std::get<7>(interp_params); + + auto nw_mask = std::get<8>(interp_params); + auto ne_mask = std::get<9>(interp_params); + auto sw_mask = std::get<10>(interp_params); + auto se_mask = std::get<11>(interp_params); + + auto i_y_n = std::get<12>(interp_params); + auto i_x_w = std::get<13>(interp_params); + + auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); + auto i_ne_offset = i_nw_offset + iVec(inp_sW); + auto i_sw_offset = i_nw_offset + iVec(inp_sH); + auto i_se_offset = i_sw_offset + iVec(inp_sW); + + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + auto inp_slice_C_ptr = inp_slice[c].data(); + + // mask_gather zeros out the mask, so we need to make copies + Vec nw_mask_copy = nw_mask; + Vec ne_mask_copy = ne_mask; + Vec sw_mask_copy = sw_mask; + Vec se_mask_copy = se_mask; + auto nw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_nw_offset, nw_mask_copy); + auto ne_val = mask_gather(Vec(0), inp_slice_C_ptr, i_ne_offset, ne_mask_copy); + auto sw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_sw_offset, sw_mask_copy); + auto se_val = mask_gather(Vec(0), inp_slice_C_ptr, i_se_offset, se_mask_copy); + + auto interpolated = (nw_val * nw) + (ne_val * ne) + (sw_val * sw) + (se_val * se); + interpolated.store(out_slice[c].data() + offset, len); + } + } + + inline void backward(TensorAccessor& gInp_slice, + TensorAccessor& gGrid_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + Vec x, y, gx_mult, gy_mult; + std::tie(x, gx_mult) = compute_W.apply_get_grad(grid_x); + std::tie(y, gy_mult) = compute_H.apply_get_grad(grid_y); + + Vec n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask; + iVec i_y_n, i_x_w; + + std::tie( + n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask, + i_y_n, i_x_w) = compute_interp_params(x, y); + + auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); + auto i_ne_offset = i_nw_offset + iVec(inp_sW); + auto i_sw_offset = i_nw_offset + iVec(inp_sH); + auto i_se_offset = i_sw_offset + iVec(inp_sW); + + auto i_gInp_nw_offset = i_y_n * iVec(inp_W) + i_x_w; + auto i_gInp_ne_offset = i_gInp_nw_offset + iVec(1); + auto i_gInp_sw_offset = i_gInp_nw_offset + iVec(inp_W); + auto i_gInp_se_offset = i_gInp_sw_offset + iVec(1); + + // When reading input values, we used mask_gather. Unfortunately, there is + // no mask_scatter_add (the backward of mask_gather) in Intel intrinsics. + // So we store the necessary vectors to temporary arrays and use the helper + // mask_scatter_add defined above. 
+ + integer_t i_gInp_nw_offset_arr[iVec::size]; + integer_t i_gInp_ne_offset_arr[iVec::size]; + integer_t i_gInp_sw_offset_arr[iVec::size]; + integer_t i_gInp_se_offset_arr[iVec::size]; + i_gInp_nw_offset.store(i_gInp_nw_offset_arr); + i_gInp_ne_offset.store(i_gInp_ne_offset_arr); + i_gInp_sw_offset.store(i_gInp_sw_offset_arr); + i_gInp_se_offset.store(i_gInp_se_offset_arr); + + integer_t i_nw_mask_arr[iVec::size]; + integer_t i_ne_mask_arr[iVec::size]; + integer_t i_sw_mask_arr[iVec::size]; + integer_t i_se_mask_arr[iVec::size]; + nw_mask.store(i_nw_mask_arr); + ne_mask.store(i_ne_mask_arr); + sw_mask.store(i_sw_mask_arr); + se_mask.store(i_se_mask_arr); + + scalar_t gInp_corner_arr[Vec::size]; + + auto gx = Vec(0), gy = Vec(0); + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + auto inp_slice_C_ptr = inp_slice[c].data(); + auto gInp_slice_C_ptr = gInp_slice[c].data(); + auto gOut = Vec::loadu(gOut_slice[c].data() + offset, len); + + (nw * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_nw_offset_arr, i_nw_mask_arr, len); + (ne * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_ne_offset_arr, i_ne_mask_arr, len); + (sw * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_sw_offset_arr, i_sw_mask_arr, len); + (se * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_se_offset_arr, i_se_mask_arr, len); + + // mask_gather zeros out the mask, so we need to make copies + Vec nw_mask_copy = nw_mask; + Vec ne_mask_copy = ne_mask; + Vec sw_mask_copy = sw_mask; + Vec se_mask_copy = se_mask; + auto nw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_nw_offset, nw_mask_copy); + auto ne_val = mask_gather(Vec(0), inp_slice_C_ptr, i_ne_offset, ne_mask_copy); + auto sw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_sw_offset, sw_mask_copy); + auto se_val = mask_gather(Vec(0), inp_slice_C_ptr, i_se_offset, se_mask_copy); + + gx = gx + ((ne_val - nw_val) * s + (se_val - sw_val) * n) * gOut; + gy = gy + ((sw_val - nw_val) * e + (se_val - ne_val) * w) * gOut; + } + + gx = gx * gx_mult; + gy = gy * gy_mult; + + constexpr int64_t step = Vec::size; + auto interleaved_gGrid = interleave2(gx, gy); + auto gGrid_ptr = gGrid_slice.data() + offset * 2; + std::get<0>(interleaved_gGrid).store(gGrid_ptr, + std::min(len * 2, step)); + std::get<1>(interleaved_gGrid).store(gGrid_ptr + step, + std::max(static_cast(0), len * 2 - step)); + } +}; + +template +struct ApplyGridSample { + using Vec = Vec256; + using integer_t = int_same_size_t; + using iVec = Vec256; + + const int64_t inp_H; + const int64_t inp_W; + const int64_t inp_sH; + const int64_t inp_sW; + const int64_t C; + const int64_t inp_sC; + const ComputeLocation compute_H; + const ComputeLocation compute_W; + const bool must_in_bound = padding != GridSamplerPadding::Zeros; + + ApplyGridSample(const TensorAccessor& input) + : inp_H(input.size(2)) + , inp_W(input.size(3)) + , inp_sH(input.stride(2)) + , inp_sW(input.stride(3)) + , C(input.size(1)) + , inp_sC(input.stride(1)) + , compute_H(input.size(2)) + , compute_W(input.size(3)) {} + + inline void forward(TensorAccessor& out_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto x_nearest = x.round(); + auto y_nearest = y.round(); + + auto i_x_nearest = convert_to_int_of_same_size(x_nearest); + auto 
i_y_nearest = convert_to_int_of_same_size(y_nearest); + + auto i_mask = must_in_bound ? iVec(-1) + : (i_x_nearest > iVec(-1)) & (i_x_nearest < iVec(inp_W)) & + (i_y_nearest > iVec(-1)) & (i_y_nearest < iVec(inp_H)); + auto mask = cast(i_mask); + + auto i_offset = i_y_nearest * iVec(inp_sH) + i_x_nearest * iVec(inp_sW); + + auto out_ptr = out_slice.data() + offset; + auto out_sC = out_slice.stride(0); + auto inp_slice_ptr = inp_slice.data(); + #pragma unroll + for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) { + // mask_gather zeros out the mask, so we need to make a copy + auto mask_copy = mask; + auto inp_val = mask_gather(Vec(0), inp_slice_ptr, i_offset, mask_copy); + inp_val.store(static_cast(out_ptr), len); + } + } + + inline void backward(TensorAccessor& gInp_slice, + TensorAccessor& gGrid_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto x_nearest = x.round(); + auto y_nearest = y.round(); + + auto i_x_nearest = convert_to_int_of_same_size(x_nearest); + auto i_y_nearest = convert_to_int_of_same_size(y_nearest); + + auto i_mask = must_in_bound ? iVec(-1) + : (i_x_nearest > iVec(-1)) & (i_x_nearest < iVec(inp_W)) & + (i_y_nearest > iVec(-1)) & (i_y_nearest < iVec(inp_H)); + + auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest; // gInp is contiguous + + integer_t mask_arr[iVec::size]; + i_mask.store(mask_arr); + integer_t gInp_offset_arr[iVec::size]; + i_gInp_offset.store(gInp_offset_arr); + + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(), + gInp_offset_arr, mask_arr, len); + } + + // grid has zero 0 gradient in Nearest mode + auto gGrid_ptr = gGrid_slice.data() + offset * 2; + std::memset(gGrid_ptr, 0, sizeof(scalar_t) * len * 2); + } +}; + +// ~~~~~~~~~~~~~~~~~~ grid_sample_2d_grid_slice_iterator ~~~~~~~~~~~~~~~~~~~~~~ +// Function to apply a vectorized function on a grid slice tensor (without batch +// dimension). +// See NOTE [ Grid Sample CPU Kernels ] for details. + +template +static inline void grid_sample_2d_grid_slice_iterator( + const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { + int64_t out_H = grid_slice.size(0); + int64_t out_W = grid_slice.size(1); + int64_t grid_sH = grid_slice.stride(0); + int64_t grid_sW = grid_slice.stride(1); + int64_t grid_sCoor = grid_slice.stride(2); + auto grid_ptr = grid_slice.data(); + + using Vec = Vec256; + using iVec = Vec256>; + constexpr int64_t step = Vec::size; + + // Loop over each output pixel in grid. + // We consider the following three cases (after slicing out the batch + // dimension). + // See detailed discussions under each if-case. + + if (at::geometry_is_contiguous({out_H, out_W, 2}, {grid_sH, grid_sW, grid_sCoor})) { + // Case 1: + // Grid is contiguous. + // Strategy: Sequentially load two vectors at the same time, and get, + // e.g., {x0, y0, x1, y1}, {x2, y2, x3, y3}. Then we use + // at::vec256::deinterleave2 to get x and y vectors. 
+ auto total_size = out_H * out_W; + for (int64_t spatial_offset = 0; spatial_offset < total_size; spatial_offset += step) { + auto grid_offset = spatial_offset * 2; + auto len = std::min(step, total_size - spatial_offset); + auto vec1 = Vec::loadu(grid_ptr + grid_offset, + std::min(step, len * 2)); + auto vec2 = Vec::loadu(grid_ptr + grid_offset + step, + std::max(static_cast(0), len * 2 - step)); + auto vec_xy_pair = deinterleave2(vec1, vec2); + + auto x = std::get<0>(vec_xy_pair); + auto y = std::get<1>(vec_xy_pair); + + // make sure that x and y are valid grid sample locations + if (len < step) { + x = Vec::set(Vec(0), x, len); + y = Vec::set(Vec(0), y, len); + } + apply_fn(x, y, spatial_offset, len); + } + } else if (grid_sW == 1 || out_W == 1) { + // Case 2: + // The W dimension is contiguous. + // This can be common, e.g., grid is from a conv net output of shape + // [N, 2, H, W]. + // Strategy: Divide into two contiguous slices each of shape [H, W], and + // each containing x and y vectors. So we sequentially load a + // vector from each of them to get x and y vector + + // Function to apply along a contiguous W dimension (or flattened H x W). + auto line_fn = [&](const scalar_t *grid_ptr_x, const scalar_t *grid_ptr_y, + int64_t out_base_offset, int64_t total_size) { + for (int64_t i = 0; i < total_size; i += step) { + auto len = std::min(step, total_size - i); + auto x = Vec::loadu(grid_ptr_x + i, len); + auto y = Vec::loadu(grid_ptr_y + i, len); + // make sure that x and y are valid grid sample locations + if (len < step) { + x = Vec::set(Vec(0), x, len); + y = Vec::set(Vec(0), y, len); + } + apply_fn(x, y, out_base_offset + i, len); + } + }; + + if (at::geometry_is_contiguous({out_H, out_W}, {grid_sH, grid_sW})) { + // If [H, W] is contiguous, apply line_fn once. + line_fn(grid_ptr, grid_ptr + grid_sCoor, 0, out_H * out_W); + } else { + // If only [W] is contiguous, apply line_fn once for each h slice. + auto grid_ptr_NH = grid_ptr; + for (int64_t h = 0; h < out_H; h++) { + line_fn(grid_ptr_NH, grid_ptr_NH + grid_sCoor, h * out_W, out_W); + grid_ptr_NH += grid_sH; + } + } + } else { + // Case 3: + // General case. + // Strategy: Do a for-loop over H, for each W slice, use + // at::vec256::gather to load the x and y vectors. + auto spatial_offset = 0; + auto i_offsets_delta = iVec(grid_sW * step); + + #pragma unroll + for (int64_t h = 0; h < out_H; h++) { + auto grid_ptr_x = grid_ptr + h * grid_sH; + auto grid_ptr_y = grid_ptr_x + grid_sCoor; + auto i_offsets = iVec::arange(0, grid_sW); + #pragma unroll + for (int64_t w = 0; w < out_W; w += step) { + auto len = std::min(step, out_W - w); + if (len < step) { + // prevents illegal memory access, sets the exceeding offsets to zero + i_offsets = iVec::set(iVec(0), i_offsets, len); + } + apply_fn(gather(grid_ptr_x, i_offsets), + gather(grid_ptr_y, i_offsets), + spatial_offset, len); + + i_offsets = i_offsets + i_offsets_delta; + spatial_offset += len; + } + } + } +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~ Grid Sample Kernels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Use the structs & functions defined above to calculate grid sample forward +// and backward. +// See NOTE [ Grid Sample CPU Kernels ] for details. 
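// (A rough note on the scheduling below: grain_size is the minimum number of
// batch entries handed to one parallel_for task. It divides
// at::internal::GRAIN_SIZE by an estimate of the per-batch work
// (spatial_size * 4 in forward, i.e. two grid coordinates across two tensors,
// per the inline comment), so small spatial sizes group several batch entries
// into one task, while large spatial sizes get roughly one entry per task.)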
+ +Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode) { + auto N = input.size(0); + auto H = grid.size(1); + auto W = grid.size(2); + auto output = at::empty({N, input.size(1), H, W}, input.options()); + auto spatial_size = H * W; + auto grain_size = spatial_size == 0 ? (N + 1) + : at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/); + +#define HANDLE_CASE(interp, padding) \ + case padding: { \ + ApplyGridSample grid_sample(inp_acc); \ + parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ + for (int64_t n = begin; n < end; n++) { \ + auto out_slice = out_acc[n]; \ + auto inp_slice = inp_acc[n]; \ + grid_sample_2d_grid_slice_iterator( \ + grid_acc[n], \ + [&](const Vec256& grid_x, const Vec256& grid_y, \ + int64_t spatial_offset, int64_t len) { \ + grid_sample.forward(out_slice, inp_slice, spatial_offset, \ + grid_x, grid_y, len); \ + }); \ + } \ + }); \ + return; \ + } + +#define HANDLE_INTERP(interp) \ + case interp: { \ + switch (static_cast(padding_mode)) { \ + HANDLE_CASE(interp, GridSamplerPadding::Zeros); \ + HANDLE_CASE(interp, GridSamplerPadding::Border); \ + HANDLE_CASE(interp, GridSamplerPadding::Reflection); \ + } \ + return; \ + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_cpu_kernel_impl", [&] { + auto out_acc = output.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + switch (static_cast(interpolation_mode)) { + HANDLE_INTERP(GridSamplerInterpolation::Bilinear); + HANDLE_INTERP(GridSamplerInterpolation::Nearest); + } + }); +#undef HANDLE_CASE +#undef HANDLE_INTERP + + return output; +} + +std::tuple +grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode) { + // grad_output should be contiguous most of time. Ensuring that it is + // contiguous can greatly simplify this code. + auto grad_output = grad_output_.contiguous(); + + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + auto N = input.size(0); + auto spatial_size = grid.size(1) * grid.size(2); + auto grain_size = spatial_size == 0 ? 
(N + 1) + : at::divup(at::internal::GRAIN_SIZE, spatial_size * 10 /* 2d * 5 tensors*/); + +#define HANDLE_CASE(interp, padding) \ + case padding: { \ + ApplyGridSample grid_sample(inp_acc); \ + parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ + for (int64_t n = begin; n < end; n++) { \ + auto gInp_slice = gInp_acc[n]; \ + auto gGrid_slice = gGrid_acc[n]; \ + auto gOut_slice = gOut_acc[n]; \ + auto inp_slice = inp_acc[n]; \ + grid_sample_2d_grid_slice_iterator( \ + grid_acc[n], \ + [&](const Vec256& grid_x, const Vec256& grid_y, \ + int64_t spatial_offset, int64_t len) { \ + grid_sample.backward(gInp_slice, gGrid_slice, gOut_slice, inp_slice, \ + spatial_offset, grid_x, grid_y, len); \ + }); \ + } \ + }); \ + return; \ + } + +#define HANDLE_INTERP(interp) \ + case interp: { \ + switch (static_cast(padding_mode)) { \ + HANDLE_CASE(interp, GridSamplerPadding::Zeros); \ + HANDLE_CASE(interp, GridSamplerPadding::Border); \ + HANDLE_CASE(interp, GridSamplerPadding::Reflection); \ + } \ + return; \ + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu_kernel_impl", [&] { + auto gInp_acc = grad_input.accessor(); + auto gGrid_acc = grad_grid.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + auto gOut_acc = grad_output.accessor(); + switch (static_cast(interpolation_mode)) { + HANDLE_INTERP(GridSamplerInterpolation::Bilinear); + HANDLE_INTERP(GridSamplerInterpolation::Nearest); + } + }); +#undef HANDLE_CASE +#undef HANDLE_INTERP + + return std::make_tuple(grad_input, grad_grid); +} + +} + +REGISTER_DISPATCH(grid_sampler_2d_cpu_kernel, &grid_sampler_2d_cpu_kernel_impl); +REGISTER_DISPATCH(grid_sampler_2d_backward_cpu_kernel, &grid_sampler_2d_backward_cpu_kernel_impl); + + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.h b/aten/src/ATen/native/cpu/GridSamplerKernel.h new file mode 100644 index 00000000000000..36ba3a91cc9bb8 --- /dev/null +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include "ATen/native/DispatchStub.h" +#include "ATen/cpu/vml.h" + +#include + +namespace at { namespace native { + +using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t); +using backward_2d_fn = std::tuple(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t); +DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel); +DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index feea350fd08306..3be4ed4e06c28d 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -183,9 +183,163 @@ static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional +struct NormReduction { + // reduction width in number of scalar elements + static constexpr int WIDTH = 128 / sizeof(scalar_t); + using Vec = Vec256; + + static void apply(Tensor& res, const Tensor& self, Scalar p, at::optional dim) { + auto out_ = res.data(); + auto data_ = self.data(); + auto numel = self.numel(); + float pval = 0.0; + if (p.isIntegral()){ + pval = p.to(); + } else if (p.isFloatingPoint()) { + pval = p.to(); + } + if (!dim.has_value()) { + *out_ = reduce_all(data_, numel, pval); + return; + } + int64_t n = self.size(*dim); + int64_t stride = self.stride(*dim); + 
// A contiguous tensor does not need to hold a meaningful stride + // if the corresponding size is 1 + if (n == 1) { + stride = 1; + for (int64_t i = self.ndimension() - 1; i > *dim; i--) { + stride *= self.size(i); + } + } + int64_t batch = numel / n; + parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) { + for (int64_t bi = begin; bi < end; bi++) { + int64_t b = bi / stride; + int64_t i = bi % stride; + const scalar_t* data = &data_[b * n * stride + i]; + out_[bi] = norm_reduce(data, n, stride, pval); + } + }); + } + + static scalar_t reduce_all(const scalar_t* data_, int64_t size, float pval) { + scalar_t sum = parallel_reduce( + 0, + size, + internal::GRAIN_SIZE, + (scalar_t)0, + [=](int64_t begin, int64_t end, scalar_t init) { + const scalar_t* data = &data_[begin]; + int64_t n = end - begin; + scalar_t result = norm_reduce(data, n, 1, pval); + return result; + }, + std::plus()); + return sum; + } + + static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) { + scalar_t result = 0.0; + if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) { + int64_t n_rounded = round_down(n, WIDTH); + scalar_t result1 = norm_reduce128(data, n_rounded, pval); + scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval); + result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval); + } else { + result = norm_reduce_sequential(data, n, stride, pval); + } + return result; + } + + static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) { + scalar_t result = 0.0; + if (pval == 0) { + for (int64_t k = 0; k < n; k++) { + result += (data[k * stride] != 0.0); + } + } else if (pval == 1) { + for (int64_t k = 0; k < n; k++) { + result += std::abs(data[k * stride]); + } + } else if (pval == 2) { + for (int64_t k = 0; k < n; k++) { + result += data[k * stride] * data[k * stride]; + } + result = std::sqrt(result); + } else if (pval == 3) { + for (int64_t k = 0; k < n; k++) { + result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]); + } + result = std::pow(result, 1.0/3); + } else if (std::isinf(pval)) { + for (int64_t k = 0; k < n; k++) { + result = std::abs(data[k * stride]) > result ? 
std::abs(data[k * stride]) : result; + } + result = result; + } else { + for (int64_t k = 0; k < n; k++) { + result += std::pow(std::abs(data[k * stride]), pval); + } + result = std::pow(result, 1.0/pval); + } + return result; + } + + // Reduce down a column of WIDTH elements (128 bytes) with the given number n + // n is already rounded by 128 + static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) { + scalar_t result = 0.0; + Vec acc[4] = {0.0, 0.0, 0.0, 0.0}; // 128 bytes (two cache lines) + static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes"); + int64_t rows = n / WIDTH; + if (pval == 1){ + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + val.abs(); + } + } + } + else if (pval == 2) { + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + val * val; + } + } + } + else if (pval == 3) { + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + (val * val * val).abs(); + } + } + } + scalar_t buf[WIDTH] = {0}; + for (int j = 0; j != 4; j++) { + acc[j].store(&buf[j * Vec::size]); + } + for (int i = 0; i < WIDTH; i++) { + result += buf[i]; + } + result = std::pow(result, 1.0/pval); + return result; + } +}; + +static void norm_kernel_impl(Tensor& result, const Tensor& self, Scalar p, at::optional dim) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] { + NormReduction::apply(result, self, p, dim); + }); +} + } // anonymous namespace REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl); REGISTER_DISPATCH(prod_kernel, &prod_kernel_impl); +REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl); }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h index 4c5c8c15149a1b..5fc7c60ff2803d 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.h +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h @@ -11,4 +11,7 @@ using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional); DECLARE_DISPATCH(reduce_fn, sum_kernel); DECLARE_DISPATCH(reduce_fn, prod_kernel); +using reduce_norm_fn = void(*)(Tensor &, const Tensor &, Scalar, at::optional); +DECLARE_DISPATCH(reduce_norm_fn, norm_kernel); + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 37c8f3a364f75a..8715a9ef460ee6 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -112,22 +112,16 @@ class CuFFTConfig { if (input.type().scalarType() == ScalarType::Half) { // cuFFT on half requires compute capability of at least SM_53 auto dev_prop = at::cuda::getCurrentDeviceProperties(); - if (dev_prop->major < 5 || (dev_prop->major == 5 && dev_prop->minor < 3)) { - std::ostringstream ss; - ss << "cuFFT doesn't support signals of half type with compute " - << "capability less than SM_53, but the device containing input half " - << "tensor only has SM_" << dev_prop->major << dev_prop->minor; - throw std::runtime_error(ss.str()); - } + AT_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + "cuFFT doesn't support signals of half type with compute " + "capability less than SM_53, but the device containing input half " + "tensor only has SM_", dev_prop->major, dev_prop->minor); for (int64_t i = 0; i < 
signal_ndim; i++) { auto signal_size = checked_signal_sizes[i]; - if (!is_pow_of_two(signal_size)) { - std::ostringstream ss; - ss << "cuFFT doesn't support signals of half type with size at any " - << "dimension that is not a power of two, but got a signal size of " - << checked_signal_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(is_pow_of_two(signal_size), + "cuFFT doesn't support signals of half type with size at any ", + "dimension that is not a power of two, but got a signal size of ", + checked_signal_sizes); } clone_input |= input.stride(signal_ndim) != 1; } @@ -212,7 +206,7 @@ class CuFFTConfig { } else if (!complex_input && complex_output) { exec_type = HIPFFT_R2C; } else { - throw std::runtime_error("hipFFT doesn't support r2r (float)"); + AT_ERROR("hipFFT doesn't support r2r (float)"); } } else if (input.type().scalarType() == ScalarType::Double) { if (complex_input && complex_output) { @@ -222,13 +216,13 @@ class CuFFTConfig { } else if (!complex_input && complex_output) { exec_type = HIPFFT_D2Z; } else { - throw std::runtime_error("hipFFT doesn't support r2r (double)"); + AT_ERROR("hipFFT doesn't support r2r (double)"); } } else { std::ostringstream ss; ss << "hipFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #else @@ -249,7 +243,7 @@ class CuFFTConfig { std::ostringstream ss; ss << "cuFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #endif diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 853c04deb2215c..afa4c8e1916604 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -73,14 +73,22 @@ __global__ void EmbeddingBag_updateOutputKernel( } } if (mode == MODE_MEAN) { - weightFeatSum = weightFeatSum / static_cast(bag_size_); - bag_size[bag] = bag_size_; + if (end == begin) { + bag_size[bag] = 0; + } else { + weightFeatSum = weightFeatSum / static_cast(bag_size_); + bag_size[bag] = bag_size_; + } } if (mode == MODE_MEAN || mode == MODE_SUM) { output[bag * featureSize + featureDim] = static_cast(weightFeatSum); } else if (mode == MODE_MAX) { + if (end == begin) { + // If bag is empty, set output to 0. + weightFeatMax = 0; + } max_indices[bag * featureSize + featureDim] = maxWord; output[bag * featureSize + featureDim] = weightFeatMax; } @@ -268,8 +276,10 @@ __global__ void EmbeddingBag_accGradParametersKernel_max( int64_t bag = chunk / chunksPerBag; int64_t word_idx = max_indices[bag * stride + featureDim]; - - atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + if (word_idx >= 0) { + // If bag is empty, we have max_indices[idx] set to -1 in forward. + atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + } } } } diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index bc37e83990e192..80c7aaeb74f6a8 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -84,7 +84,7 @@ static inline Storage pin_memory(int64_t size, Tensor dummy) { name = static_cast(storage_##name.data()); template -static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +static void applyGesv(Tensor& b, Tensor& A, std::vector& infos) { #ifndef USE_MAGMA AT_ERROR("gesv: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu new file mode 100644 index 00000000000000..1c3609f50b201c --- /dev/null +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -0,0 +1,25 @@ +#include "ATen/ATen.h" + +namespace at { namespace native { + +Tensor baddbmm_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm(self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm_out_cuda(Tensor &result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm_out(result, self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm__cuda(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm_out(self, self, batch1, batch2, beta, alpha); +} + +Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) { + return _th_bmm(self, mat2); +} + +Tensor& bmm_out_cuda(Tensor &result, const Tensor& batch1, const Tensor& batch2) { + return _th_bmm_out(result, batch1, batch2); +} + +} } diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 16d7935f3d49fa..dc0b5af8d4e264 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -227,7 +227,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... - constexpr int max_threads = 1024; + constexpr int max_threads = std::is_same::value ? 1024 : 896; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -247,6 +247,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors return std::make_tuple(neg_log_likelihood, log_alpha); } @@ -452,7 +453,7 @@ __global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ grad scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; if (t < input_length) { scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; + res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; } else { res = 0.; @@ -505,7 +506,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) // As above, there may be better configurations to use. - constexpr int max_threads = 1024; + constexpr int max_threads = std::is_same::value ? 
1024 : 896; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -526,6 +527,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on @@ -550,7 +552,10 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ .sub_(log_probs.narrow(2, BLANK, 1)) .exp_() ); - // Tor the non-blank characters, we use a kernel to compute the subtrahend. + // scale by output gradient (blanks and first summand of non-blanks) + grad *= grad_out.view({1, batch_size, 1}); + + // For the non-blank characters, we use a kernel to compute the subtrahend. // Again we might configure block and grid in a better way. int threads_target = max_threads; while (threads_target / 2 >= max_target_length) { @@ -572,6 +577,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, num_labels, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } else { // small problem, use naive algorithm // Still no block/grid configuration guru... int threads_input = max_threads; @@ -595,6 +601,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, num_labels, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } return grad; } diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index c82f4e7afb87de..38b1dddb496276 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -206,7 +206,7 @@ static inline Tensor _run_cufft( CUFFT_CHECK(hipfftExecR2C(plan, static_cast(input.data_ptr()), static_cast(output.data_ptr()))); } else { - throw std::runtime_error("hipFFT doesn't support r2r (float)"); + AT_ERROR("hipFFT doesn't support r2r (float)"); } } else if (input.type().scalarType() == ScalarType::Double) { if (complex_input && complex_output) { @@ -220,13 +220,13 @@ static inline Tensor _run_cufft( CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(input.data_ptr()), static_cast(output.data_ptr()))); } else { - throw std::runtime_error("hipFFT doesn't support r2r (double)"); + AT_ERROR("hipFFT doesn't support r2r (double)"); } } else { std::ostringstream ss; ss << "hipFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #else CUFFT_CHECK(cufftXtExec(plan, input.data_ptr(), output.data_ptr(), diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu new file mode 100644 index 00000000000000..67d8f39e2de71d --- /dev/null +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -0,0 +1,502 @@ +#include "ATen/ATen.h" +#include "ATen/AccumulateType.h" +#include "ATen/TensorUtils.h" +#include "ATen/core/Error.h" + +#include "ATen/cuda/CUDAContext.h" +#include +#include + +namespace at { +namespace native { +namespace { + +// Block size for weight_norm_*_first_dim_kernel. 
+// Currently, kernels are non-persistent. +// Dialing up the block size to, say 1024, can improve performance by +// increase the amount of cache available per block, which can improve cache hit rate. +// However, this is less efficient for short rows. 256 is pretty versatile. +// May be worth implementing heuristics later. +#define BLOCK 256 + +// Block size for weight_norm_*_last_dim_kernel. +// This is tricker than the first_dim case because we must make blocks +// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses. +// Since output-element parallelism is along the fast dimension, this reduces the number of +// blocks we can launch by 16X. +#define TILE_W 16 +// Somewhat versatile strategy: max out intra-block parallelism by extending +// blocks across the slow dimension up to the hardware-max block size of 1024. +#define TILE_H 64 + +template +__device__ __forceinline__ void reduce_block_into_lanes + (T *x, + T val, + int lanes, // lanes is intended to be <= 32. + ReduceOp reduceOp) +{ + int tid = threadIdx.x + threadIdx.y*blockDim.x; + int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32. + + if(blockSize >= 64) + { + x[tid] = val; + __syncthreads(); + } + + #pragma unroll + for(int i = (blockSize >> 1); i >= 64; i >>= 1) + { + if(tid < i) + x[tid] = reduceOp(x[tid], x[tid+i]); + __syncthreads(); + } + + if(tid < 32) + { + T final; + if(blockSize >= 64) + final = reduceOp(x[tid], x[tid+32]); + else + final = val; + // __SYNCWARP(); + + #pragma unroll + for(int i = 16; i >= lanes; i >>= 1) + final = reduceOp(final, WARP_SHFL_DOWN(final, i)); + + if(tid < lanes) + x[tid] = final; // EpilogueOp + } + + // Make sure the smem result is visible to all warps. + __syncthreads(); +} + +template + +__global__ void weight_norm_fwd_first_dim_kernel + (scalar_t* __restrict__ w, + accscalar_t* __restrict__ norms, + const scalar_t* __restrict__ v, + const scalar_t* __restrict__ g, + const int rowSize) +{ + // We are norming each slowest-dim row of the tensor separately. + // For now, assign one block to each row. + const int tid = threadIdx.x; + const int row = blockIdx.x; + const int stride = blockDim.x; + + // Logical index offset for this flattened row + const int rowStart = row*rowSize; + + // Hack to get around nvcc complaining when an smem array is declared with the same name + // but different types in different kernels (in this case different instantiations) + // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t val_f = scalar_cast(v[i+rowStart]); + thread_sum += val_f*val_f; // AccumOp, could do Kahan here + } + + reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); + accscalar_t result = s[0]; + + result = sqrtf(result); + + if(tid == 0) + norms[row] = result; + + // Broadcast load, could use shared memory instead. 
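// Illustrative sketch (hypothetical, not part of this change): the "could use
// shared memory instead" alternative mentioned in the comment above would stage
// g[row] once per block and broadcast it from smem, roughly:
//
//   __shared__ accscalar_t g_staged;                      // hypothetical extra slot
//   if (tid == 0)
//     g_staged = scalar_cast<accscalar_t>(g[row]);
//   __syncthreads();
//   accscalar_t g_this_row = g_staged;
//
// The kernel keeps the plain per-thread load below: every thread reads the same
// address, which the cache already serves as a broadcast.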
+ accscalar_t g_this_row = scalar_cast(g[row]); + + accscalar_t rnorm = 1.f/result; // for consistency with backward kernel + + // Write data to output + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t val_f = scalar_cast(v[i+rowStart]); + w[i+rowStart] = scalar_cast(g_this_row*val_f*rnorm); + } +} + +template + +__global__ void weight_norm_fwd_last_dim_kernel +( + scalar_t* __restrict__ w, + accscalar_t* __restrict__ norms, + const scalar_t* __restrict__ v, + const scalar_t* __restrict__ g, + const int fast_dim_size, + const int slower_dims_size +) +{ + const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; + + extern __shared__ char buf[]; + accscalar_t* alloc = (accscalar_t*)buf; + accscalar_t* s = &alloc[0]; + accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y]; + + accscalar_t thread_sum = 0.f; + + int slower_dims_location = threadIdx.y; + int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t val_f = scalar_cast(v[currentIdx]); + thread_sum += val_f*val_f; // AccumOp, could do Kahan here + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } + + reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); + + // Better to pass an EpilogueOp to reduce_block_into_lanes? + if(threadIdx.y == 0) + { + accscalar_t result = s[threadIdx.x]; + accscalar_t norm_this_col = sqrtf(result); + norms[fast_dim_location] = norm_this_col; + rnorms_this_block[threadIdx.x] = 1.f/norm_this_col; + } + + __syncthreads(); + + accscalar_t g_this_col = scalar_cast(g[fast_dim_location]); + accscalar_t rnorm = rnorms_this_block[threadIdx.x]; + + slower_dims_location = threadIdx.y; + currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t val_f = scalar_cast(v[currentIdx]); + w[currentIdx] = scalar_cast(g_this_col*val_f*rnorm); + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } +} + +template + +__global__ void weight_norm_bwd_first_dim_kernel + (scalar_t* __restrict__ grad_v, + scalar_t* __restrict__ grad_g, + const scalar_t* __restrict__ grad_w, + const scalar_t* __restrict__ saved_v, + const scalar_t* __restrict__ saved_g, + const accscalar_t* __restrict__ saved_norms, + const int rowSize) +{ + // For now, assign one block to each row. + const int tid = threadIdx.x; + const int row = blockIdx.x; + const int stride = blockDim.x; + + // Logical index offset for this flattened row + const int rowStart = row*rowSize; + + // Hack to get around nvcc complaining when an smem array is declared with the same name + // but different types in different kernels (in this case different instantiations) + // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t grad_wi = scalar_cast(grad_w[i+rowStart]); + accscalar_t saved_vi = scalar_cast(saved_v[i+rowStart]); + thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here + } + + reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); + accscalar_t result = s[0]; + + // Could choose to save reciprocal of norm instead I suppose, but norms is probably + // more handy to keep around. 
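// Illustrative note (not part of this change): the algebra behind the rnorm /
// rnorm3 factors used below. With w = g * v / ||v|| per row and
// result = sum_i grad_w[i] * v[i] (the block reduction just computed),
//
//   grad_g    = result / ||v||                                  -> result * rnorm
//   grad_v[j] = g * (grad_w[j] / ||v|| - v[j] * result / ||v||^3)
//             -> g_this_row * (rnorm * grad_wj - rnorm3 * saved_vj * result)
//
// A scalar reference for one row (hypothetical helper, for comparison only):
//
//   void weight_norm_bwd_row_ref(float* grad_v, float& grad_g,
//                                const float* grad_w, const float* v,
//                                float g, float norm, float dot, int n) {
//     float rnorm = 1.f / norm, rnorm3 = rnorm * rnorm * rnorm;
//     grad_g = dot * rnorm;
//     for (int j = 0; j < n; ++j)
//       grad_v[j] = g * (rnorm * grad_w[j] - rnorm3 * v[j] * dot);
//   }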
+ // Broadcast load; could use shared memory instead. + accscalar_t rnorm = 1.f/saved_norms[row]; + accscalar_t rnorm3 = rnorm*rnorm*rnorm; + + // Write g gradients. + if(tid == 0) + grad_g[row] = scalar_cast(result*rnorm); + + // Broadcast load, could use shared memory instead. + accscalar_t g_this_row = scalar_cast(saved_g[row]); + + // Write v gradients. We are reusing values that were loaded earlier, so there + // is an optimization opportunity here (store values persistently). + for(int j = tid; j < rowSize; j += stride ) + { + accscalar_t grad_wj = scalar_cast(grad_w[j+rowStart]); + accscalar_t saved_vj = scalar_cast(saved_v[j+rowStart]); + accscalar_t grad_vj = g_this_row*(rnorm*grad_wj - rnorm3*saved_vj*result); + grad_v[j+rowStart] = scalar_cast(grad_vj); + } +} + +template + +__global__ void weight_norm_bwd_last_dim_kernel + (scalar_t* __restrict__ grad_v, + scalar_t* __restrict__ grad_g, + const scalar_t* __restrict__ grad_w, + const scalar_t* __restrict__ saved_v, + const scalar_t* __restrict__ saved_g, + const accscalar_t* __restrict__ saved_norms, + const int fast_dim_size, + const int slower_dims_size) +{ + const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; + + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + + int slower_dims_location = threadIdx.y; + int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t grad_wi = scalar_cast(grad_w[currentIdx]); + accscalar_t saved_vi = scalar_cast(saved_v[currentIdx]); + thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } + + reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); + accscalar_t result = s[threadIdx.x]; + + // Broadcast load; could use shared memory instead. + accscalar_t rnorm = 1.f/saved_norms[fast_dim_location]; + accscalar_t rnorm3 = rnorm*rnorm*rnorm; + + // Write g gradients. + if(threadIdx.y == 0) + grad_g[fast_dim_location] = scalar_cast(result*rnorm); + + // Entire block pulls these values, could use shared memory instead. + accscalar_t g_this_col = scalar_cast(saved_g[fast_dim_location]); + + // Write v gradients. + slower_dims_location = threadIdx.y; + currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t grad_wj = scalar_cast(grad_w[currentIdx]); + accscalar_t saved_vj = scalar_cast(saved_v[currentIdx]); + accscalar_t grad_vj = g_this_col*(rnorm*grad_wj - rnorm3*saved_vj*result); + grad_v[currentIdx] = scalar_cast(grad_vj); + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } +} + +} // anonymous namespace + +std::tuple weight_norm_cuda + (const Tensor & v, + const Tensor & g, + int64_t dim) +{ + auto w = at::empty_like(v); + + // weight_norm_fused does have a derivative defined in derivatives.yaml, therefore, VariableType.cpp + // sends the unpacked g.data() as the argument. In other words, we expect "g" is a bare Tensor here. + + // norms is only needed to stash for backward. + // g.type().scalarType() may be at::ScalarType::Double, Float, or Half. + // If Half, stash norms as float. + at::ScalarType AccType = g.type().scalarType() == at::ScalarType::Half ? 
+ at::ScalarType::Float : g.type().scalarType(); + // Will this create norms on the same device as g, regardless of what the thread's default + // current device is? I believe so, because Type::* functions are DeviceGuard()ed. + auto norms = g.type().toScalarType(AccType).tensor(g.sizes(), g.strides()); + + const int ndims = v.dim(); + + if(dim == 0) + { + // Find logical size of each flattened slowest-dim row + int rowSize = 1; + for(int i = ndims - 1; i > 0; i--) + rowSize *= v.size(i); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (v.type(), + "weight_norm_fwd_first_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_fwd_first_dim_kernel + <<>> + (w.data(), + norms.data(), + v.data(), + g.data(), + rowSize); + }); + } + else if(dim == ndims - 1) + { + // Precompute slower_dims_size and fast_dim_size + int slower_dims_size = 1; + for(int i = 0; i < ndims - 1; i++) + slower_dims_size *= v.size(i); + + int fast_dim_size = v.size(ndims-1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (v.type(), + "weight_norm_fwd_last_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_fwd_last_dim_kernel + <<<(fast_dim_size+TILE_W-1)/TILE_W, + dim3(TILE_W,TILE_H), + (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), + stream>>> + (w.data(), + norms.data(), + v.data(), + g.data(), + fast_dim_size, + slower_dims_size); + }); + } + + // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, + // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught + // until a later error check on a synchronizing CUDA call. Unfortunately, without manually + // synchronizing here, this is the best we can do. + THCudaCheck(cudaGetLastError()); + + return std::tuple{w, norms}; +} + +std::tuple weight_norm_cuda_backward + (const Tensor & grad_w, + const Tensor & saved_v, + const Tensor & saved_g, + const Tensor & saved_norms, + int64_t dim) +{ + // These checks should always succeed, because weight_norm_fused_backward should only + // ever be recorded in the autograd graph via weight_norm, which passes contiguous v and g. + AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + AT_CHECK(dim == 0 || dim == saved_v.dim() - 1, "fused kernels can only be applied for first or last dim") + + auto grad_v = at::empty_like(saved_v); + auto grad_g = at::empty_like(saved_g); + + const int ndims = saved_v.dim(); + + if(dim == 0) + { + // Find logical size of each flattened slowest-dim row + int rowSize = 1; + for(int i = ndims - 1; i > 0; i--) + rowSize *= saved_v.size(i); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (saved_v.type(), + "weight_norm_bwd_first_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_bwd_first_dim_kernel + <<>> + (grad_v.data(), + grad_g.data(), + grad_w.data(), + saved_v.data(), + saved_g.data(), + saved_norms.data(), + rowSize); + }); + } + else if(dim == ndims - 1) + { + // Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array. 
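// Illustrative note (not part of this change): for dim == ndims - 1 the tensor
// is handled as a 2-D view [slower_dims_size, fast_dim_size], with one norm
// group (and one g entry) per index along the fast dimension. For example, a
// contiguous saved_v of shape {128, 64, 5} with dim == 2 flattens to
//
//   slower_dims_size = 128 * 64 = 8192   // product of all leading dims
//   fast_dim_size    = 5                 // size of `dim`; one norm per index here
//
// which is what the loop below computes before sizing the launch as
// ceil(fast_dim_size / TILE_W) blocks of TILE_W x TILE_H threads.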
+ int slower_dims_size = 1; + for(int i = 0; i < ndims - 1; i++) + slower_dims_size *= saved_v.size(i); + + int fast_dim_size = saved_v.size(ndims-1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (saved_v.type(), + "weight_norm_bwd_last_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_bwd_last_dim_kernel + <<<(fast_dim_size+TILE_W-1)/TILE_W, + dim3(TILE_W,TILE_H), + (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), + stream>>> + (grad_v.data(), + grad_g.data(), + grad_w.data(), + saved_v.data(), + saved_g.data(), + saved_norms.data(), + fast_dim_size, + slower_dims_size); + }); + } + + // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, + // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught + // until a later error check on a synchronizing CUDA call. Unfortunately, without manually + // synchronizing here, this is the best we can do. + THCudaCheck(cudaGetLastError()); + + return std::tuple{grad_v, grad_g}; +} + +#undef BLOCK +#undef TILE_W +#undef TILE_H + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp index 6856c465e9e8ef..463d4ffea3cf04 100644 --- a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -12,13 +12,13 @@ namespace at { namespace native { Tensor cudnn_affine_grid_generator_forward( const Tensor& theta, int64_t N, int64_t C, int64_t H, int64_t W) { - throw std::runtime_error("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); } Tensor cudnn_affine_grid_generator_backward( const Tensor& grad_theta, int64_t N, int64_t C, int64_t H, int64_t W) { - throw std::runtime_error("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); } }} diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 7cd7466a285035..d54fe256b29152 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -13,7 +13,7 @@ std::tuple cudnn_batch_norm( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double exponential_average_factor, double epsilon) { - throw std::runtime_error("cudnn_batch_norm: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_batch_norm: ATen not compiled with cuDNN support"); } std::tuple cudnn_batch_norm_backward( @@ -21,7 +21,7 @@ std::tuple cudnn_batch_norm_backward( const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_var, double epsilon) { - throw std::runtime_error("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); } }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index a94916532a3f02..afbd7653aefa67 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -14,61 +14,61 @@ at::Tensor cudnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList stride, 
IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_bias( const at::Tensor& grad_output) { - throw std::runtime_error("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); } std::tuple cudnn_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("cudnn_convolution_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_input( const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); } std::tuple cudnn_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } }} @@ 
-194,16 +194,12 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) { - if (args.size() > expected_size){ - std::stringstream ss; - ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } - else if (args.size() < expected_size){ - std::stringstream ss; - ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + AT_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); if (num_negative_values > 0){ @@ -211,7 +207,7 @@ static void check_args(CheckedFrom c, IntList args, size_t expected_size, const ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } @@ -449,7 +445,7 @@ perf_t getBestAlgorithm(perf_t *perfResults, bool deterministic, int n_algo) { return perfResults[i]; } } - throw std::runtime_error("no deterministic convolution algorithms available in CuDNN"); + AT_ERROR("no deterministic convolution algorithms available in CuDNN"); } else { return perfResults[0]; } diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index 1ce92cf7e18d2a..e859344bcc3691 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -11,13 +11,13 @@ namespace at { namespace native { Tensor cudnn_grid_sampler_forward( const Tensor& input_t, const Tensor& grid_t) { - throw std::runtime_error("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); } std::tuple cudnn_grid_sampler_backward( const Tensor& input_t, const Tensor& grid_t, const Tensor& grad_output_t) { - throw std::runtime_error("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); } }} diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index 966aa20e0a128d..98c0cb7918f02f 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -14,7 +14,7 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { - throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); + AT_ERROR("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); } }} diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 
8fc896afe23a12..876590409c43c4 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -22,7 +22,7 @@ Tensor _cudnn_rnn_flatten_weight( int64_t fn_num_layers, bool batch_first, bool fn_bidirectional ) { - throw std::runtime_error("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); } std::tuple _cudnn_rnn( @@ -34,7 +34,7 @@ std::tuple _cudnn_rnn( bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, const Tensor& fn_dropout_state ) { - throw std::runtime_error("_cudnn_rnn: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support"); } std::tuple> _cudnn_rnn_backward( @@ -47,11 +47,11 @@ std::tuple> _cudnn_rnn_backward( const Tensor& dropout_state, const Tensor& reserve, std::array output_mask ) { - throw std::runtime_error("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); } Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int64_t dropout_seed) { - throw std::runtime_error("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); } }} // namespace at::native @@ -123,7 +123,7 @@ namespace { { std::ostringstream oss; oss << "unrecognized cuDNN RNN mode " << fn_mode; - throw std::runtime_error(oss.str()); + AT_ERROR(oss.str()); } } } @@ -131,7 +131,7 @@ namespace { void set_bidirectional(bool fn_bidirectional) { bidirectional = fn_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; } - + void set_algo(cudnnRNNAlgo_t algo){ this->algo = algo; } @@ -570,7 +570,7 @@ namespace { if (prop->major == 7 && rnn.datatype == CUDNN_DATA_HALF && !tensors.is_input_packed()) { if (rnn.num_layers == 1 && rnn.hidden_size <= 1024 && tensors.input_size <=1024 && rnn.num_directions() == 1 && rnn.hidden_size % 128 == 0 && tensors.input_size % 128 == 0){ - //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, + //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, //weed them out if ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8){ if ((tensors.seq_length >=40 && bsize <=128) || @@ -599,9 +599,8 @@ Tensor _cudnn_rnn_flatten_weight( bool fn_bidirectional ) { - if (weight_arr.size() == 0) { - throw std::runtime_error("_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); - } + AT_CHECK(weight_arr.size() > 0, + "_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); auto any_param = weight_arr[0]; @@ -671,9 +670,8 @@ std::tuple _cudnn_rnn( // TODO: Set device to input if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } // TODO: can batch_first be a wrapper around this function? 
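// Illustrative summary (not part of the patch): the error-handling conversion
// applied throughout this file and the files above. A check previously spelled
//
//   if (!hx.is_contiguous()) {
//     throw std::runtime_error("rnn: hx is not contiguous");
//   }
//
// becomes a single call taking the condition plus message pieces, which are
// stringified and concatenated only if the condition fails:
//
//   AT_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous");
//   AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size),
//            "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes());
//
// AT_ERROR(...) is the unconditional counterpart used for the
// "ATen not compiled with X support" stubs and other hard failures.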
@@ -685,12 +683,10 @@ std::tuple _cudnn_rnn( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); auto output = input.type().tensor(output_size); @@ -723,11 +719,8 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); } - if (cx.defined() && !cx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); - throw std::runtime_error(oss.str()); - } + AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes()); size_t workspace_size; auto x_descs_arr = descs.get_x_descs(); @@ -817,9 +810,8 @@ std::tuple _cudnn_rnn_backward_input( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -833,12 +825,10 @@ std::tuple _cudnn_rnn_backward_input( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); auto dy = grad_output.contiguous(); @@ -851,42 +841,25 @@ std::tuple _cudnn_rnn_backward_input( AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); auto dcx = cx.defined() ? 
cx.type().tensor(hidden_size) : Tensor(); - if (!fn_train) { - throw std::runtime_error("cudnn RNN backward can only be called in training mode"); - } - if (!input.sizes().equals(input_size)) { - std::ostringstream oss; - oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); - throw std::runtime_error(oss.str()); - } - if (!output.sizes().equals(output_size)) { - std::ostringstream oss; - oss << "Expected output size " << IntList{output_size} << ", got " << output.sizes(); - throw std::runtime_error(oss.str()); - } - if (hx.defined() && !hx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); - throw std::runtime_error(oss.str()); - } - if (cx.defined() && !cx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); - throw std::runtime_error(oss.str()); - } - if (dhy.defined() && !dhy.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected d_hidden size " << IntList{hidden_size} << ", got " << dhy.sizes(); - throw std::runtime_error(oss.str()); - } - if (dcy.defined() && !dcy.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected d_cell size " << IntList{hidden_size} << ", got " << dcy.sizes(); - throw std::runtime_error(oss.str()); - } - if (!dhy.is_cuda() || !dy.is_cuda() || (dcy.defined() && !dcy.is_cuda())) { - throw std::runtime_error("Gradients aren't CUDA tensors"); - } + AT_CHECK(fn_train, + "cudnn RNN backward can only be called in training mode"); + + AT_CHECK(input.sizes().equals(input_size), + "Expected input size ", IntList{input_size}, ", got ", input.sizes()); + AT_CHECK(output.sizes().equals(output_size), + "Expected output size ", IntList{output_size}, ", got ", output.sizes()); + + AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", IntList{hidden_size}, ", got ", hx.sizes()); + AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes()); + AT_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), + "Expected d_hidden size ", IntList{hidden_size}, ", got ", dhy.sizes()); + AT_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), + "Expected d_cell size ", IntList{hidden_size}, ", got ", dcy.sizes()); + + AT_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), + "Gradients aren't CUDA tensors"); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors); fn.rnn.set_algo(algo); @@ -959,9 +932,8 @@ std::vector _cudnn_rnn_backward_weight( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -973,28 +945,21 @@ std::vector _cudnn_rnn_backward_weight( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); - if (!fn_train) { - throw std::runtime_error("cudnn RNN backward can only be called in training mode"); - } - if (!input.sizes().equals(input_size)) { - std::ostringstream oss; - oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); - throw std::runtime_error(oss.str()); - } - if (hx.defined() && !hx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected hidden size " << IntList{hidden_size} << ", got 
" << hx.sizes(); - throw std::runtime_error(oss.str()); - } + AT_CHECK(fn_train, + "cudnn RNN backward can only be called in training mode"); + + AT_CHECK(input.sizes().equals(input_size), + "Expected input size ", IntList{input_size}, ", got ", input.sizes()); + AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", IntList{hidden_size}, ", got ", hx.sizes()); + // TODO: the above were the only checks in rnn.py, but it doesn't seem // like these checks are enough - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); const auto& y = output; @@ -1059,9 +1024,9 @@ std::tuple> _cudnn_rnn_backward( std::array output_mask ) { - auto grad_output = grad_output_r.defined() ? grad_output_r : output.type().zeros_like(output); - auto grad_hy = grad_hy_r.defined() ? grad_hy_r : hx.type().zeros_like(hx); - auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : cx.type().zeros_like(cx)) : grad_cy_r; + auto grad_output = grad_output_r.defined() ? grad_output_r : at::zeros_like(output); + auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx); + auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : at::zeros_like(cx)) : grad_cy_r; Tensor dx, dhx, dcx; // NB: unconditionally compute this gradient, because it mutates reserve diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 997431b7a86170..c9d25780bd65d3 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -14,7 +14,7 @@ std::tuple miopen_batch_norm( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double exponential_average_factor, double epsilon) { - throw std::runtime_error("miopen_batch_norm: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_batch_norm: ATen not compiled with MIOpen support"); } std::tuple miopen_batch_norm_backward( @@ -22,7 +22,7 @@ std::tuple miopen_batch_norm_backward( const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_var, double epsilon) { - throw std::runtime_error("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); } }} // namespace at::native diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 1ae36edd5c7b76..9aeaad73558617 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -13,61 +13,61 @@ at::Tensor miopen_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool 
benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_backward_input: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_input: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_backward_weight: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_weight: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_bias( const at::Tensor& grad_output) { - throw std::runtime_error("miopen_convolution_backward_bias: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_bias: ATen not compiled with MIOpen support"); } std::tuple miopen_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("miopen_convolution_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose_backward_input( const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); } std::tuple miopen_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); } }} @@ -180,16 +180,12 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) { - if (args.size() > expected_size){ - std::stringstream ss; - ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << 
expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } - else if (args.size() < expected_size){ - std::stringstream ss; - ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + AT_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); if (num_negative_values > 0){ @@ -197,7 +193,7 @@ static void check_args(CheckedFrom c, IntList args, size_t expected_size, const ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp new file mode 100644 index 00000000000000..062dd56d2ca300 --- /dev/null +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -0,0 +1,95 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Config.h" + +#if !AT_MKL_ENABLED() + +namespace at { namespace native { + +Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + AT_ERROR("bmm: ATen not compiled with MKL support"); +} + +}} + +#else // AT_MKL_ENABLED + +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/Dispatch.h" +#include "ATen/Utils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace at { namespace native { + +static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int batch_size, const int M, const int N, const int K, const float alpha, + const float** A, const float** B, const float beta, float** C) { + const int lda = (trans_A == CblasNoTrans) ? K : M; + const int ldb = (trans_B == CblasNoTrans) ? N : K; + const int ldc = N; + + cblas_sgemm_batch(CblasRowMajor, &trans_A, &trans_B, &M, &N, &K, &alpha, + A, &lda, B, &ldb, &beta, C, &ldc, 1, &batch_size); +} + +static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int batch_size, const int M, const int N, const int K, const double alpha, + const double** A, const double** B, const double beta, double** C) { + const int lda = (trans_A == CblasNoTrans) ? K : M; + const int ldb = (trans_B == CblasNoTrans) ? N : K; + const int ldc = N; + + cblas_dgemm_batch(CblasRowMajor, &trans_A, &trans_B, &M, &N, &K, &alpha, + A, &lda, B, &ldb, &beta, C, &ldc, 1, &batch_size); +} + +template +static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, const Tensor& mat2, Scalar beta_, Scalar alpha_) { + auto is_transposed = [&](const Tensor& t) { + return t.stride(0) == 1 && t.stride(1) == t.size(0); + }; + const CBLAS_TRANSPOSE trans_A = is_transposed(mat1[0]) ? CblasTrans : CblasNoTrans; + const CBLAS_TRANSPOSE trans_B = is_transposed(mat2[0]) ? 
CblasTrans : CblasNoTrans; + + const int batch_size = mat1.size(0); + const int M = mat1.size(1); + const int N = mat2.size(2); + const int K = mat1.size(2); + scalar_t alpha = alpha_.to(); + scalar_t beta = beta_.to(); + + std::vector A(batch_size); + std::vector B(batch_size); + std::vector C(batch_size); + for (int64_t batch = 0; batch < batch_size; batch++) { + A[batch] = mat1[batch].data(); + B[batch] = mat2[batch].data(); + C[batch] = res[batch].data(); + } + + gemm_batched(trans_A, trans_B, batch_size, M, N, K, alpha, A.data(), B.data(), beta, C.data()); +} + +Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + // checks are done in native/LinearAlgebra.cpp + AT_DISPATCH_FLOATING_TYPES(self.type(), "baddbmm__mkl", [&] { + baddbmm_mkl_template(self, batch1, batch2, beta, alpha); + }); + + return self; +} + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index c3451824c05113..2c81d69d3b8435 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -12,7 +12,7 @@ Tensor _fft_mkl(const Tensor& input, int64_t signal_ndim, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes) { - throw std::runtime_error("fft: ATen not compiled with MKL support"); + AT_ERROR("fft: ATen not compiled with MKL support"); } }} @@ -191,12 +191,8 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, osize = output_sizes[i]; istride = complex_input ? input.stride(i) >> 1 : input.stride(i); ostride = onumel; - if (isize > MKL_LONG_MAX || osize > MKL_LONG_MAX || ostride > MKL_LONG_MAX) { - std::ostringstream ss; - ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " - << MKL_LONG_MAX << "]"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, + "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); if (!need_contiguous && istride > MKL_LONG_MAX) { // If we didn't plan to contiguous-fy but the `istride` exceeds bound, // check if we can stride (equal to `inumel`) get back within bound if @@ -205,12 +201,8 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, // fine as `inumel` is non-decreasing. 
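The MKL path above submits all batches in a single grouped `cblas_?gemm_batch` call (one group of `batch_size` same-shape multiplications). For reference, this is a dependency-free sketch of the computation `_baddbmm_mkl_` performs on contiguous row-major data, namely `C[b] = beta * C[b] + alpha * A[b] * B[b]` for every batch `b`; the flat-buffer layout and naive loops are illustrative assumptions, not the MKL calling convention:

```cpp
#include <cstddef>
#include <vector>

// res: batch x M x N, mat1: batch x M x K, mat2: batch x K x N,
// all stored contiguously in row-major order.
void baddbmm_reference(std::vector<float>& res,
                       const std::vector<float>& mat1,
                       const std::vector<float>& mat2,
                       std::size_t batch, std::size_t M, std::size_t N,
                       std::size_t K, float beta, float alpha) {
  for (std::size_t b = 0; b < batch; ++b) {
    const float* A = mat1.data() + b * M * K;
    const float* B = mat2.data() + b * K * N;
    float* C = res.data() + b * M * N;
    for (std::size_t i = 0; i < M; ++i) {
      for (std::size_t j = 0; j < N; ++j) {
        float acc = 0.f;
        for (std::size_t k = 0; k < K; ++k)
          acc += A[i * K + k] * B[k * N + j];
        C[i * N + j] = beta * C[i * N + j] + alpha * acc;
      }
    }
  }
}
```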
need_contiguous = true; } - if (need_contiguous && inumel > MKL_LONG_MAX) { - std::ostringstream ss; - ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " - << MKL_LONG_MAX << "]"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, + "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); inumel *= isize; onumel *= osize; } @@ -227,7 +219,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, std::ostringstream ss; ss << "MKL FFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } // signal type DFTI_CONFIG_VALUE signal_type; diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index 00f4e8f95b92d4..ddbd6977645e74 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -9,25 +9,25 @@ namespace at { namespace native { at::Tensor mkldnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, IntList padding, IntList stride, IntList dilation, int64_t groups) { - throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); } at::Tensor mkldnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support"); } std::tuple mkldnn_convolution_backward_weights( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support"); } std::tuple mkldnn_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) { - throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward: ATen not compiled with MKLDNN support"); } }} diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2b71982be8cf6a..5c99d7c97e9b3e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -245,6 +245,27 @@ CPU: _atan_out_cpu CUDA: _atan_out_cuda +- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU: baddbmm_cpu + CUDA: baddbmm_cuda + +- func: baddbmm_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method + dispatch: + CPU: baddbmm__cpu + CUDA: baddbmm__cuda + +- func: _baddbmm_mkl_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: baddbmm_out(Tensor result, Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: baddbmm_out_cpu + CUDA: 
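These `native_functions.yaml` entries register `baddbmm` (function, method, in-place, and `_out` variants with CPU/CUDA dispatch) and leave `_baddbmm_mkl_` dispatch-free so the CPU implementation can forward to it when MKL is available; the `bmm` entries just below give it the plain batched-matmul sibling. A minimal caller-side sketch, assuming an ATen build at roughly this revision (shapes chosen arbitrarily):

```cpp
#include <ATen/ATen.h>

int main() {
  at::Tensor a = at::rand({8, 3, 4});   // batch x M x K
  at::Tensor b = at::rand({8, 4, 5});   // batch x K x N
  at::Tensor c = at::zeros({8, 3, 5});  // batch x M x N

  at::Tensor prod = at::bmm(a, b);                            // plain batched matmul
  at::Tensor out  = at::baddbmm(c, a, b, /*beta=*/1, /*alpha=*/2);
  c.baddbmm_(a, b);                                           // in-place method variant

  // With c initialized to zeros, baddbmm reduces to alpha * (a @ b).
  return out.allclose(2 * prod) ? 0 : 1;
}
```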
baddbmm_out_cuda + - func: bartlett_window(int64_t window_length, TensorOptions options={}) -> Tensor - func: bartlett_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor @@ -281,6 +302,18 @@ - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor +- func: bmm(Tensor self, Tensor mat2) -> Tensor + variants: function, method + dispatch: + CPU: bmm_cpu + CUDA: bmm_cuda + +- func: bmm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor + variants: function + dispatch: + CPU: bmm_out_cpu + CUDA: bmm_out_cuda + - func: broadcast_tensors(TensorList tensors) -> TensorList - func: cat(TensorList tensors, int64_t dim=0) -> Tensor @@ -374,7 +407,7 @@ - func: conv3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1, int64_t groups=1) -> Tensor -- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) -> Tensor +- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad=0) -> Tensor - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int64_t pad) -> (Tensor, Tensor, Tensor) @@ -840,6 +873,9 @@ - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor variants: function, method +- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) -> Tensor + variants: function + - func: inverse(Tensor self) -> Tensor variants: function, method @@ -1177,6 +1213,8 @@ - func: permute(Tensor self, IntList dims) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. +- func: pixel_shuffle(Tensor self, int64_t upscale_factor) -> Tensor + - func: pin_memory(Tensor self) -> Tensor variants: function, method @@ -1713,6 +1751,27 @@ CPU: _s_where_cpu CUDA: _s_where_cuda +- func: norm_except_dim(Tensor v, int64_t pow=2, int64_t dim=0) -> Tensor + variants: function + +# VariableType::_weight_norm does not want to be given a gap in the autograd graph, +# so we don't define "dispatch" variants for it. 
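`pixel_shuffle(self, upscale_factor)`, registered above, follows the standard sub-pixel convolution semantics: a `(C*r*r, H, W)` input is rearranged into `(C, H*r, W*r)` by moving the `r*r` channel factor into `r x r` spatial blocks. A standalone index-level sketch of that mapping on a flat row-major buffer (an illustration of the semantics only, not the ATen kernel, and it ignores leading batch dimensions):

```cpp
#include <cstddef>
#include <vector>

// in: (C*r*r) x H x W, row-major; out: C x (H*r) x (W*r), row-major.
std::vector<float> pixel_shuffle_ref(const std::vector<float>& in,
                                     std::size_t C, std::size_t H,
                                     std::size_t W, std::size_t r) {
  std::vector<float> out(C * H * r * W * r);
  for (std::size_t c = 0; c < C; ++c)
    for (std::size_t ry = 0; ry < r; ++ry)
      for (std::size_t rx = 0; rx < r; ++rx)
        for (std::size_t h = 0; h < H; ++h)
          for (std::size_t w = 0; w < W; ++w) {
            std::size_t in_c = (c * r + ry) * r + rx;
            std::size_t src  = (in_c * H + h) * W + w;
            std::size_t dst  = (c * H * r + h * r + ry) * (W * r) + (w * r + rx);
            out[dst] = in[src];
          }
  return out;
}
```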
+- func: _weight_norm(Tensor v, Tensor g, int64_t dim=0) -> Tensor + variants: function + +- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int64_t dim=0) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: weight_norm_cuda + +- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: weight_norm_cuda_backward + +- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor) + variants: function + - func: zeros(IntList size, TensorOptions options={}) -> Tensor - func: zeros_out(Tensor result, IntList size) -> Tensor @@ -2030,6 +2089,22 @@ variants: function, method device_guard: False +- func: to(Tensor self, Device device, ScalarType dtype, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, ScalarType dtype, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, Device device, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, Tensor other, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + - func: meshgrid(TensorList tensors) -> TensorList # This has a method dispatch to work around circular include problems diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index c96e577ba9d47a..25fd4fc5df4326 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -68,7 +68,7 @@ SparseTensor new_sparse(const SparseType& dtype) { } else { type_id = SparseCPUTensorId(); } - return SparseTensor(c10::make_intrusive(type_id, dtype.scalarType()).release(), /* retain */ false); + return SparseTensor(c10::make_intrusive(type_id, scalarTypeToTypeMeta(dtype.scalarType()))); } /*** Helper methods ***/ diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 2e37ad41a3b96e..afd8001734a9a8 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -784,7 +784,7 @@ Tensor& _sspaddmm_out_only_sparse(Tensor& result, const Tensor& self, // sparse, dense -> sparse Tensor smm(const Tensor& self, const Tensor& mat2) { auto result = self.type().tensor(); - self.type().sspaddmm_out(result, result, self, mat2, 0.0, 1.0); + at::sspaddmm_out(result, result, self, mat2, 0.0, 1.0); return result; } @@ -792,7 +792,7 @@ Tensor smm(const Tensor& self, const Tensor& mat2) { Tensor sspaddmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { auto result = self.type().tensor(); - self.type().sspaddmm_out(result, self, mat1, mat2, beta, alpha); + at::sspaddmm_out(result, self, mat1, mat2, beta, alpha); return result; } diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index 173ac439487d26..98b22c7f8e1d6a 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -217,8 +217,20 @@ def signature(option, i=None, value=None): (raw_args - filtered_args)] +def is_extended_method(option): + if 'method' in option['variants']: + return False + elif option.get('deprecated', False): + return False + elif not option['variants']: + return False + else: + return True + + def run(declarations): 
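`norm_except_dim` and the `_weight_norm` entries above (alongside the CUDA and differentiable-backward variants) implement weight normalization: `w = g * v / ||v||`, with the norm taken over every dimension of `v` except `dim`. A dependency-free sketch of the math for the common `dim = 0` case on a 2-D weight; the layout, names, and the small epsilon are illustrative choices only:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// v and w are rows x cols, row-major; g has one entry per row (dim = 0 slice).
void weight_norm_dim0(const std::vector<float>& v, const std::vector<float>& g,
                      std::vector<float>& w, std::size_t rows, std::size_t cols) {
  for (std::size_t i = 0; i < rows; ++i) {
    float norm_sq = 0.f;  // squared norm of row i over all dims except dim 0
    for (std::size_t j = 0; j < cols; ++j)
      norm_sq += v[i * cols + j] * v[i * cols + j];
    float inv_norm = 1.f / std::sqrt(norm_sq + 1e-12f);  // illustrative zero guard
    for (std::size_t j = 0; j < cols; ++j)
      w[i * cols + j] = g[i] * v[i * cols + j] * inv_norm;
  }
}
```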
declarations = [d for d in declarations if not exclude(d)] + non_extended_methods = set() for declaration in declarations: common_with_cwrap.set_declaration_defaults(declaration) declaration['options'] = [deepcopy(o) for o in declaration['options']] @@ -237,6 +249,20 @@ def run(declarations): sanitize_return(option) process_types_and_backends(option) add_variants(option) + if not is_extended_method(option): + non_extended_methods.add(option['api_name']) declaration['options'] = handle_outputs_taken_as_arguments( declaration['options']) + + # We (very unfortunately) have overloaded virtual methods. Because + # of C++'s rules, we cannot move one overload without doing some + # extra work to make sure that overload in a superclass and an + # overload in a subclass resolve together. I've chosen to resolve + # this problem simply by moving ALL overloads of a method which + # occurs in Tensor to Type. This is why we have to first compute + # which methods *names* go on type, and then move ALL overloads + # of this name to Type. + for declaration in declarations: + for option in declaration['options']: + option['extended_method'] = option['api_name'] not in non_extended_methods return declarations diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index b4a2e05e759ea3..8bbc17af5da291 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -4,6 +4,7 @@ #include "ATen/core/Scalar.h" #include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" #include "ATen/Tensor.h" #include "ATen/core/Storage.h" #include "ATen/core/Generator.h" @@ -20,14 +21,19 @@ using native::tensor; ${function_declarations} -static inline Type & infer_type(const Tensor & t) { +namespace detail { + +static inline TypeExtendedInterface & infer_type(const Tensor & t) { AT_CHECK(t.defined(), "undefined Tensor"); - return t.type(); + return getType(t); } -static inline Type & infer_type(const TensorList & tl) { +static inline TypeExtendedInterface & infer_type(const TensorList & tl) { AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); - return tl[0].type(); + return getType(tl[0]); } + +} // namespace detail + // function definitions are all static inline because // they are one-line statically dispatched functions that // invoke the actual dynamic dispatch on the correct argument diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 82a2f00ff77bc7..c6355127734b1b 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 064da2bc2186e0..61035f2c3d38f7 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -11,7 +11,7 @@ #include "ATen/Allocator.h" #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" @@ -32,6 +32,9 @@ namespace at { ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } +caffe2::TypeMeta ${Type}::typeMeta() const { + return caffe2::TypeMeta::Make<${ScalarType}>(); +} Backend ${Type}::backend() const { return Backend::${Backend}; } diff --git a/aten/src/ATen/templates/Tensor.h 
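The long comment added to `run()` above is about C++ name hiding: declaring one overload of a virtual method in a derived class hides the base class's other overloads of that name (unless a `using` declaration re-exposes them), so overloads of a method that exists on both `Tensor` and `Type` have to move as a complete set. A minimal illustration of the rule, with invented names:

```cpp
struct Base {
  virtual ~Base() = default;
  virtual int sum(int a) const { return a; }
  virtual int sum(int a, int b) const { return a + b; }
};

struct Derived : Base {
  // Declaring only this overload hides Base::sum(int, int) for lookups
  // through Derived; `using Base::sum;` would bring it back.
  int sum(int a) const override { return a + 1; }
};

int main() {
  Derived d;
  int one_arg = d.sum(1);        // OK: calls Derived::sum(int), yields 2
  // int two_arg = d.sum(1, 2);  // error: Base::sum(int, int) is hidden
  Base& b = d;
  int via_base = b.sum(1, 2);    // still fine through a Base reference, yields 3
  return (one_arg == 2 && via_base == 3) ? 0 : 1;
}
```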
b/aten/src/ATen/templates/Tensor.h index 77c18466404c83..85e7c84961d6ee 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include "ATen/core/Device.h" #include "ATen/core/Layout.h" #include "ATen/core/Scalar.h" @@ -9,9 +7,9 @@ #include "ATen/core/SparseTensorRef.h" #include "ATen/core/Storage.h" #include "ATen/core/TensorAccessor.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/core/optional.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/core/Error.h" namespace at { @@ -41,20 +39,12 @@ namespace at { // special care must be taken to handle this. struct AT_API Tensor { Tensor(){}; - Tensor(TensorImpl* tensor_impl, bool retain) - : tensor_impl_(c10::intrusive_ptr::reclaim( - tensor_impl)) { - if (tensor_impl == nullptr) { + Tensor(c10::intrusive_ptr tensor_impl) + : tensor_impl_(std::move(tensor_impl)) { + if (tensor_impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); } - if (retain && tensor_impl != UndefinedTensor::singleton()) { - c10::raw::intrusive_ptr::incref(tensor_impl); - } } - Tensor(const c10::intrusive_ptr& ptr) - : tensor_impl_(ptr) {} - Tensor(c10::intrusive_ptr&& ptr) - : tensor_impl_(std::move(ptr)) {} Tensor(const Tensor&) = default; Tensor(Tensor&&) = default; @@ -69,7 +59,7 @@ struct AT_API Tensor { TensorImpl * unsafeReleaseTensorImpl() { return tensor_impl_.release(); } - const c10::intrusive_ptr& getIntrusivePtr() const { + const c10::intrusive_ptr& getIntrusivePtr() const { return tensor_impl_; } @@ -152,7 +142,7 @@ struct AT_API Tensor { return tensor_impl_->type_id(); } ScalarType scalar_type() const { - return tensor_impl_->scalar_type(); + return dataTypeToScalarType(tensor_impl_->dtype().id()); } const Storage& storage() const { return tensor_impl_->storage(); @@ -162,12 +152,6 @@ struct AT_API Tensor { Tensor toType(ScalarType t) const; Tensor toBackend(Backend b) const; - /// New-style `to()` methods. - /// NB: These methods are defined in TensorOptions.h. - Tensor to(Device device, ScalarType dtype, bool non_blocking = false) const; - Tensor to(ScalarType dtype, bool non_blocking = false) const; - Tensor to(Device device, bool non_blocking = false) const; - /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. /// Defined in Type.h because of include order issues. bool is_variable() const noexcept; @@ -202,6 +186,8 @@ struct AT_API Tensor { AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) #undef TO_C_TYPE + // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and + // dimension. template TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); @@ -211,6 +197,20 @@ struct AT_API Tensor { template TensorAccessor accessor() && = delete; + // Return a `PackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and + // dimension. You can optionally specify RestrictPtrTraits as a template parameter to + // cast the data pointer to a __restrict__ pointer. + // In order to use this, your CUDA kernel has to take a corresponding PackedTensorAccessor + // as an argument. 
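The comments introduced here document the two accessor flavours: `accessor<T, N>()` for CPU tensors, and `packed_accessor<T, N>()` (defined just below, and exercised by the new `cuda_packedtensoraccessor_test.cu` later in this diff) for passing a tensor into a CUDA kernel by value. A short host-side sketch of the CPU accessor, assuming an ATen build:

```cpp
#include <ATen/ATen.h>
#include <cstdio>

int main() {
  at::Tensor t = at::rand({3, 4});     // CPU float tensor
  auto acc = t.accessor<float, 2>();   // checked against dtype and dim() at runtime

  float total = 0.f;
  for (int64_t i = 0; i < acc.size(0); ++i)
    for (int64_t j = 0; j < acc.size(1); ++j)
      total += acc[i][j];              // strided indexing, no per-element dispatch

  std::printf("sum = %f\n", total);
  return 0;
}
```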
+ template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); + } + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() && = delete; + Tensor operator-() const; Tensor& operator+=(const Tensor & other); Tensor& operator+=(Scalar other); @@ -267,7 +267,7 @@ struct AT_API Tensor { friend struct WeakTensor; protected: - c10::intrusive_ptr tensor_impl_; + c10::intrusive_ptr tensor_impl_; }; struct AT_API WeakTensor { @@ -295,6 +295,8 @@ struct AT_API WeakTensor { } private: - c10::weak_intrusive_ptr weak_tensor_impl_; + c10::weak_intrusive_ptr weak_tensor_impl_; }; } // namespace at + +#include "ATen/core/TensorMethods.h" diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 07e5f2b634372c..8283bea01f6bed 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -1,13 +1,10 @@ #pragma once -// ${generated_comment} - -#include "ATen/Tensor.h" +#include "ATen/core/Tensor.h" #include "ATen/core/Scalar.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/Type.h" +#include "ATen/core/Type.h" #include "ATen/core/TensorOptions.h" -#include "ATen/DeviceGuard.h" namespace at { @@ -44,45 +41,6 @@ inline TensorOptions Tensor::options() const { .is_variable(is_variable()); } -namespace detail { -inline Tensor to( - const Tensor& tensor, - const TensorOptions& options, - bool non_blocking) { - // Don't copy if the options match. - if (tensor.options() == options) { - return tensor; - } - AT_CHECK(tensor.is_variable() == options.is_variable(), - "cannot change is_variable, from: ", tensor.is_variable(), - " to: ", options.is_variable()); - DeviceGuard guard(options.device()); - return tensor.type().toBackend(options.backend()).toScalarType(options.dtype()).copy(tensor, non_blocking); -} -} // namespace detail - -inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) - const { - if (this->device() == device && this->dtype() == dtype) { - return *this; - } - return detail::to(*this, options().device(device).dtype(dtype), non_blocking); -} - -inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { - if (this->dtype() == dtype) { - return *this; - } - return detail::to(*this, options().dtype(dtype), non_blocking); -} - -inline Tensor Tensor::to(Device device, bool non_blocking) const { - if (this->device() == device) { - return *this; - } - return detail::to(*this, options().device(device), non_blocking); -} - inline void Tensor::backward( at::optional gradient, bool keep_graph, @@ -97,6 +55,22 @@ inline void Tensor::set_data(Tensor new_data) { // all static inline to allow for inlining of the non-dynamic part of dispatch ${tensor_method_definitions} +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? 
get_device() : -1); +} + #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 3a7080ea201e35..0e00a5d3499fcd 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include "ATen/core/ATenGeneral.h" #include "ATen/core/Allocator.h" #include "ATen/core/Deprecated.h" @@ -10,7 +8,6 @@ #include "ATen/core/Scalar.h" #include "ATen/core/ScalarType.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" #include "ATen/core/TensorTypeIdRegistration.h" @@ -36,6 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; +struct Tensor; static inline void noop_deleter(void*) {} @@ -55,6 +53,7 @@ struct AT_API Type { virtual ~Type() {} virtual ScalarType scalarType() const = 0; + virtual caffe2::TypeMeta typeMeta() const = 0; virtual Backend backend() const = 0; Layout layout() const noexcept { return layout_from_backend(backend()); } virtual bool is_cuda() const = 0; @@ -99,7 +98,7 @@ struct AT_API Type { return backendToDeviceType(backend()); } - virtual Tensor copy(const Tensor & src, bool non_blocking=false) const = 0; + virtual Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const = 0; virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0; virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; @@ -144,20 +143,6 @@ struct AT_API Type { }; -inline bool Tensor::is_variable() const noexcept { - return type().is_variable(); -} - -inline ScalarType Tensor::dtype() const noexcept { - return type().scalarType(); -} - -inline Layout Tensor::layout() const noexcept { - return type().layout(); -} - -inline Device Tensor::device() const { - return Device(type().device_type(), type().is_cuda() ? 
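The `Type.h` changes here drop the `ATen/Tensor.h` include, forward-declare `struct Tensor;`, and move the inline `Tensor` methods that need a complete `Type` (`is_variable`, `dtype`, `layout`, `device`) into `TensorMethods.h`, which is included only once both classes are fully defined. A compressed single-file sketch of that layering; the section markers stand in for the real headers:

```cpp
// --- "Type.h": may refer to Tensor through an incomplete type -------------
struct Tensor;  // forward declaration suffices for parameter and return types

struct Type {
  virtual ~Type() = default;
  virtual bool is_variable() const = 0;
  virtual Tensor copy(const Tensor& src) const = 0;  // declaration only
};

// --- "Tensor.h": full Tensor definition, refers to Type by pointer --------
struct Tensor {
  const Type* type_ = nullptr;
  const Type& type() const { return *type_; }
  bool is_variable() const;  // body needs a complete Type, so it moves out
};

// --- "TensorMethods.h": bodies that need both complete classes ------------
inline bool Tensor::is_variable() const { return type().is_variable(); }

// --- toy backend + usage ---------------------------------------------------
struct CPUType final : Type {
  bool is_variable() const override { return false; }
  Tensor copy(const Tensor& src) const override { return src; }
};

int main() {
  CPUType cpu;
  Tensor t;
  t.type_ = &cpu;
  return t.is_variable() ? 1 : 0;
}
```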
get_device() : -1); -} - } // namespace at + +#include "ATen/core/Tensor.h" diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 5e614edc57f216..0891f6d9f4f492 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -22,8 +22,11 @@ Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking return s_copy_(self, b_src, non_blocking); } -Tensor TypeDefault::copy(const Tensor & src, bool non_blocking) const { - // TODO(psag): have a DeviceGuard here +Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional to_device) const { + DeviceGuard device_guard; + if (to_device.has_value()) { + device_guard.set_index(to_device.value().index()); + } AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); if (is_sparse()) { auto indices = src._indices(); @@ -91,29 +94,33 @@ Tensor TypeDefault::tensorWithAllocator(IntList sizes, IntList strides, Allocato } Storage TypeDefault::storage(bool resizable) const { - return Storage(scalarType(), 0, allocator(), resizable); + return Storage(typeMeta(), 0, allocator(), resizable); } Storage TypeDefault::storage(size_t size, bool resizable) const { - return Storage(scalarType(), size, allocator(), resizable); + return Storage(typeMeta(), size, allocator(), resizable); } Storage TypeDefault::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return Storage( - scalarType(), + typeMeta(), InefficientStdFunctionContext::makeDataPtr(data, deleter, getDeviceFromPtr(data)), size, deleter); } Storage TypeDefault::storageWithAllocator(int64_t size, Allocator* allocator) const { - return Storage(scalarType(), size, allocator); + return Storage(typeMeta(), size, allocator); } Tensor TypeDefault::unsafeTensorFromTH(void * th_pointer, bool retain) const { - return Tensor(static_cast(th_pointer), retain); + auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); + if (retain && tensor_impl.get() != UndefinedTensorImpl::singleton()) { + c10::raw::intrusive_ptr::incref(tensor_impl.get()); + } + return Tensor(std::move(tensor_impl)); } Storage TypeDefault::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain && th_pointer) { c10::raw::intrusive_ptr::incref(static_cast(th_pointer)); } - return Storage(static_cast(th_pointer)); + return Storage(c10::intrusive_ptr::reclaim(static_cast(th_pointer))); } diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index 64ec158f82349e..e4a75abb48993e 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/templates/TypeDefault.h @@ -2,13 +2,13 @@ // ${generated_comment} -#include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" namespace at { -struct AT_API TypeDefault : public Type { +struct AT_API TypeDefault : public TypeExtendedInterface { explicit TypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined) - : Type(type_id, is_variable, is_undefined) {} + : TypeExtendedInterface(type_id, is_variable, is_undefined) {} // Make sure overload resolution considers the nullary virtual method. // (A single argument overload is generated in the list.) 
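`unsafeTensorFromTH` above now adopts the raw `TensorImpl*` with `intrusive_ptr::reclaim` (take over the reference the pointer already carries, without touching the refcount) and only bumps the count manually when `retain` is requested and the impl is not the undefined singleton. A toy refcounting sketch of that adopt-versus-retain distinction; this illustrates the idea only and is not c10's `intrusive_ptr`:

```cpp
#include <cassert>

struct Counted { int refcount = 1; };  // freshly created objects own one reference

void incref(Counted* p) { ++p->refcount; }
void decref(Counted* p) { if (--p->refcount == 0) delete p; }

// Minimal owning handle in the spirit of an intrusive pointer.
struct Handle {
  Counted* ptr;
  explicit Handle(Counted* p) : ptr(p) {}
  Handle(Handle&& o) : ptr(o.ptr) { o.ptr = nullptr; }
  Handle(const Handle&) = delete;
  Handle& operator=(const Handle&) = delete;
  ~Handle() { if (ptr) decref(ptr); }

  // "reclaim": adopt the reference the raw pointer already owns (no incref).
  static Handle reclaim(Counted* raw) { return Handle(raw); }
  // reclaim after an explicit incref: the caller keeps its own reference too.
  static Handle reclaim_retained(Counted* raw) { incref(raw); return Handle(raw); }
};

int main() {
  // retain == false: the handle simply takes over the existing reference.
  Counted* a = new Counted();
  { Handle h = Handle::reclaim(a); assert(a->refcount == 1); }  // a deleted here

  // retain == true: incref first, so the caller's reference stays valid.
  Counted* b = new Counted();
  { Handle h = Handle::reclaim_retained(b); assert(b->refcount == 2); }
  assert(b->refcount == 1);
  decref(b);  // caller drops its own reference; b deleted here
  return 0;
}
```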
@@ -25,7 +25,7 @@ struct AT_API TypeDefault : public Type { Type & toBackend(Backend b) const override; Type & toScalarType(ScalarType s) const override; - Tensor copy(const Tensor & src, bool non_blocking=false) const override; + Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const override; Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const override; void backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const override; diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 5b26400ea52f12..d012274c5fceed 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -10,11 +10,11 @@ $th_headers $storage_tensor_headers #include "ATen/${Generator}.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/Allocator.h" #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" @@ -32,9 +32,15 @@ namespace at { ${Type}::${Type}() : ${DenseBackend}TypeDefault(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} + ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } + +caffe2::TypeMeta ${Type}::typeMeta() const { + return caffe2::TypeMeta::Make<${ScalarType}>(); +} + Backend ${Type}::backend() const { return Backend::${Backend}; } diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index 3a48d8b26e32b4..116df9b4d465fe 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -19,6 +19,7 @@ namespace at { struct ${Type} final : public ${DenseBackend}TypeDefault { explicit ${Type}(); virtual ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; virtual Backend backend() const override; virtual const char * toString() const override; virtual size_t elementSizeInBytes() const override; diff --git a/aten/src/ATen/templates/TypeExtendedInterface.h b/aten/src/ATen/templates/TypeExtendedInterface.h new file mode 100644 index 00000000000000..82cb658c9eeea8 --- /dev/null +++ b/aten/src/ATen/templates/TypeExtendedInterface.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace at { + +struct AT_API TypeExtendedInterface : public Type { + explicit TypeExtendedInterface(TensorTypeId type_id, bool is_variable, bool is_undefined) + : Type(type_id, is_variable, is_undefined) {} + ${pure_virtual_extended_type_method_declarations} +}; + +} // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 4fcbeaa137ae78..8103f025988aab 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -27,7 +27,8 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/apply_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/stream_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_half_test.cu - ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu) + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu) if (CUDNN_FOUND) list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_test.cpp) diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp index 986f599da6d11b..fc39eccee3926b 100644 --- a/aten/src/ATen/test/apply_test.cpp +++ 
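The new `TypeExtendedInterface.h` template above splits the generated pure virtuals out of the core `Type`: `Type` keeps the stable, hand-written surface, `TypeExtendedInterface` layers the generated `${pure_virtual_extended_type_method_declarations}` on top, and `TypeDefault` (and hence the per-backend types) now derive from the extended interface. A skeletal sketch of that layering; the method names and the single constructor argument are made up:

```cpp
// Core interface: what ordinary consumers of "Type" rely on.
struct Type {
  explicit Type(bool is_variable) : is_variable_(is_variable) {}
  virtual ~Type() = default;
  virtual const char* toString() const = 0;
  bool is_variable() const { return is_variable_; }
 private:
  bool is_variable_;
};

// Extended interface: generated, ATen-internal entry points live here.
struct TypeExtendedInterface : public Type {
  explicit TypeExtendedInterface(bool is_variable) : Type(is_variable) {}
  virtual int generated_entry_point() const = 0;  // hypothetical generated method
};

// The default implementation derives from the extended interface, as in the diff.
struct TypeDefault : public TypeExtendedInterface {
  explicit TypeDefault(bool is_variable) : TypeExtendedInterface(is_variable) {}
  const char* toString() const override { return "TypeDefault"; }
  int generated_entry_point() const override { return 0; }
};

int main() {
  TypeDefault t(/*is_variable=*/false);
  return t.is_variable() ? 1 : t.generated_entry_point();
}
```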
b/aten/src/ATen/test/apply_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "cuda.h" #include "cuda_runtime.h" @@ -11,111 +11,111 @@ Tests related to tensor indexing and applying operations. */ #ifndef _WIN32 -TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { +CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { int sizes[] = {4, 4}; int strides[] = {4, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (4 * 4)); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (4 * 4)); } -TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { +CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { int sizes[] = {6, 3, 7}; int strides[] = {3 * 7, 7, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (6 * 3 * 7)); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (6 * 3 * 7)); } -TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { +CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { int sizes[] = {4, 3, 2}; int strides[] = {3 * 3, 3, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (4 * 3)); - REQUIRE(ti.sizes[1] == 2); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (4 * 3)); + CATCH_REQUIRE(ti.sizes[1] == 2); } -TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { +CATCH_TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { int sizes[] = {3, 2}; int strides[] = {2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (3 * 2)); - REQUIRE(ti.strides[0] == 2); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (3 * 2)); + CATCH_REQUIRE(ti.strides[0] == 2); } -TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ +CATCH_TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (3 * 6)); - REQUIRE(ti.strides[0] == 22); - REQUIRE(ti.sizes[1] == (5 * 2)); - REQUIRE(ti.strides[1] == 2); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); + CATCH_REQUIRE(ti.strides[0] == 22); + CATCH_REQUIRE(ti.sizes[1] == (5 * 2)); + CATCH_REQUIRE(ti.strides[1] == 2); } -TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { +CATCH_TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { int sizes[] = {1, 10, 1, 5, 4}; int strides[] = {4, 0, 16, 0, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (10 * 5)); - REQUIRE(ti.strides[0] == 0); - REQUIRE(ti.sizes[1] == 4); - REQUIRE(ti.strides[1] == 1); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (10 * 5)); + CATCH_REQUIRE(ti.strides[0] == 0); + CATCH_REQUIRE(ti.sizes[1] == 4); + CATCH_REQUIRE(ti.strides[1] 
== 1); } -TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { +CATCH_TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { int sizes[] = {1, 1, 1}; int strides[] = {17, 12, 3}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - REQUIRE(ti.collapseDims() == 0); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == 1); - REQUIRE(ti.strides[0] == 1); + CATCH_REQUIRE(ti.collapseDims() == 0); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == 1); + CATCH_REQUIRE(ti.strides[0] == 1); } -TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { +CATCH_TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - REQUIRE(ti.collapseDims(1) == 1); - REQUIRE(ti.dims == 3); - REQUIRE(ti.sizes[0] == 3); - REQUIRE(ti.strides[0] == (6 * 22)); - REQUIRE(ti.sizes[1] == 6); - REQUIRE(ti.strides[1] == 22); - REQUIRE(ti.sizes[2] == (5 * 2)); - REQUIRE(ti.strides[2] == 2); + CATCH_REQUIRE(ti.collapseDims(1) == 1); + CATCH_REQUIRE(ti.dims == 3); + CATCH_REQUIRE(ti.sizes[0] == 3); + CATCH_REQUIRE(ti.strides[0] == (6 * 22)); + CATCH_REQUIRE(ti.sizes[1] == 6); + CATCH_REQUIRE(ti.strides[1] == 22); + CATCH_REQUIRE(ti.sizes[2] == (5 * 2)); + CATCH_REQUIRE(ti.strides[2] == 2); } -TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { +CATCH_TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - REQUIRE(ti.collapseDims(2) == 1); - REQUIRE(ti.dims == 3); - REQUIRE(ti.sizes[0] == (3 * 6)); - REQUIRE(ti.strides[0] == 22); - REQUIRE(ti.sizes[1] == 5); - REQUIRE(ti.strides[1] == 4); - REQUIRE(ti.sizes[2] == 2); - REQUIRE(ti.strides[2] == 2); + CATCH_REQUIRE(ti.collapseDims(2) == 1); + CATCH_REQUIRE(ti.dims == 3); + CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); + CATCH_REQUIRE(ti.strides[0] == 22); + CATCH_REQUIRE(ti.sizes[1] == 5); + CATCH_REQUIRE(ti.strides[1] == 4); + CATCH_REQUIRE(ti.sizes[2] == 2); + CATCH_REQUIRE(ti.strides[2] == 2); } -TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { +CATCH_TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { int sizes[] = {1, 1, 1}; int strides[] = {17, 12, 3}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - REQUIRE_THROWS(ti.collapseDims(5)); + _CATCH_REQUIRE_THROWS(ti.collapseDims(5)); } #endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 38027baae97b73..22be6de7acbc02 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/CPUApplyUtils.h" @@ -108,32 +108,32 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { }); } -TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } -TEST_CASE("apply utils test 2-dim small", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim small", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } -TEST_CASE("apply utils test 2-dim", "[cpu]") { 
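The `apply_test.cpp` cases above (now using the `CATCH_`-prefixed macros) all exercise `TensorInfo::collapseDims`, which drops size-1 dimensions and merges adjacent dimensions whenever the outer stride equals the inner size times the inner stride. A standalone sketch of that rule, checked against two of the test expectations; it ignores the optional excluded-dimension argument that the "Exclusion" cases cover:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

struct Dims { std::vector<long> sizes, strides; };

// Simplified version of the collapsing rule the tests above verify.
Dims collapse(const std::vector<long>& sizes, const std::vector<long>& strides) {
  Dims out;
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == 1) continue;                        // size-1 dims vanish
    if (!out.sizes.empty() &&
        out.strides.back() == sizes[i] * strides[i]) {  // contiguous pair:
      out.sizes.back() *= sizes[i];                     // merge into one dim
      out.strides.back() = strides[i];
    } else {
      out.sizes.push_back(sizes[i]);
      out.strides.push_back(strides[i]);
    }
  }
  if (out.sizes.empty()) { out.sizes = {1}; out.strides = {1}; }  // point tensor
  return out;
}

int main() {
  Dims d = collapse({4, 4}, {4, 1});   // "2D Contiguous" case
  assert(d.sizes.size() == 1 && d.sizes[0] == 16);

  Dims s = collapse({3, 2}, {4, 2});   // "2D Strided Collapse" case
  assert(s.sizes[0] == 6 && s.strides[0] == 2);
  return 0;
}
```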
+CATCH_TEST_CASE("apply utils test 2-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } -TEST_CASE("apply utils test 3-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 3-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2}); } -TEST_CASE("apply utils test 3-dim medium", "[cpu]") { +CATCH_TEST_CASE("apply utils test 3-dim medium", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } -TEST_CASE("apply utils test 10-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 10-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index dd37fa86af3a36..8dffa3d7c02c75 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -19,26 +18,26 @@ void trace() { trace += foo_a[i][i]; } - REQUIRE(foo.trace().toCFloat() == Approx(trace)); + EXPECT_FLOAT_EQ(foo.trace().toCFloat(), trace); } -TEST_CASE( "atest", "[]" ) { - +// TEST_CASE( "atest", "[]" ) { +TEST(atest, atest) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); auto foo = rand({12,6}); - REQUIRE(foo.data() == foo.toFloatData()); + EXPECT_EQ(foo.data(), foo.toFloatData()); - REQUIRE(foo.size(0) == 12); - REQUIRE(foo.size(1) == 6); + EXPECT_EQ(foo.size(0), 12); + EXPECT_EQ(foo.size(1), 6); foo = foo+foo*3; foo -= 4; Scalar a = 4; float b = a.to(); - REQUIRE(b == 4); + EXPECT_EQ(b, 4); foo = (foo*foo) == (foo.pow(3)); foo = 2 + (foo+1); @@ -51,7 +50,7 @@ TEST_CASE( "atest", "[]" ) { } } - REQUIRE(foo.equal(4 * ones({12, 6}, kByte))); + EXPECT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); trace(); @@ -61,17 +60,18 @@ TEST_CASE( "atest", "[]" ) { auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); auto f_a = f.accessor(); - REQUIRE(f_a[0][0][0] == 1.0); - REQUIRE(f_a[0][1][1] == 5.0); + EXPECT_EQ(f_a[0][0][0], 1.0); + EXPECT_EQ(f_a[0][1][1], 5.0); - REQUIRE(f.strides()[0] == 6); - REQUIRE(f.strides()[1] == 3); - REQUIRE(f.strides()[2] == 1); - REQUIRE(f.sizes()[0] == 1); - REQUIRE(f.sizes()[1] == 2); - REQUIRE(f.sizes()[2] == 3); + EXPECT_EQ(f.strides()[0], 6); + EXPECT_EQ(f.strides()[1], 3); + EXPECT_EQ(f.strides()[2], 1); + EXPECT_EQ(f.sizes()[0], 1); + EXPECT_EQ(f.sizes()[1], 2); + EXPECT_EQ(f.sizes()[2], 3); - REQUIRE_THROWS(f.resize_({3,4,5})); + // TODO(ezyang): maybe do a more precise exception type. 
+ ASSERT_THROW(f.resize_({3,4,5}), std::exception); { int isgone = 0; { @@ -79,7 +79,7 @@ TEST_CASE( "atest", "[]" ) { isgone++; }); } - REQUIRE(isgone == 1); + EXPECT_EQ(isgone, 1); } { int isgone = 0; @@ -90,9 +90,9 @@ TEST_CASE( "atest", "[]" ) { }); a_view = f2.view({3,2,1}); } - REQUIRE(isgone == 0); + EXPECT_EQ(isgone, 0); a_view.reset(); - REQUIRE(isgone == 1); + EXPECT_EQ(isgone, 1); } if(at::hasCUDA()) { @@ -103,6 +103,6 @@ TEST_CASE( "atest", "[]" ) { isgone++; }); } - REQUIRE(isgone==1); + EXPECT_EQ(isgone, 1); } } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 94988122adedcc..c64fdec0089dff 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/core/Reduction.h" @@ -20,66 +20,66 @@ using namespace at; using Catch::Matchers::StartsWith; static void test(Type & type) { - SECTION( "resize" ) { + CATCH_SECTION( "resize" ) { auto a = type.tensor(); a.resize_({3,4}); - REQUIRE(a.numel() == 12); + CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); - REQUIRE(a.numel() == 35); + CATCH_REQUIRE(a.numel() == 35); } - SECTION( "ones and dot" ) { + CATCH_SECTION( "ones and dot" ) { Tensor b0 = ones({1, 1}, type); - REQUIRE(2 == (b0+b0).sum().toCDouble()); + CATCH_REQUIRE(2 == (b0+b0).sum().toCDouble()); Tensor b1 = ones({1, 2}, type); - REQUIRE(4 == (b1+b1).sum().toCDouble()); + CATCH_REQUIRE(4 == (b1+b1).sum().toCDouble()); Tensor b = ones({3, 4}, type); - REQUIRE(24 == (b+b).sum().toCDouble()); - REQUIRE(12 == b.numel()); - REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); + CATCH_REQUIRE(24 == (b+b).sum().toCDouble()); + CATCH_REQUIRE(12 == b.numel()); + CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); } - SECTION( "rand" ) { + CATCH_SECTION( "rand" ) { for(auto i = 0; i < 10; i++) { Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); } } - SECTION( "sort" ) { + CATCH_SECTION( "sort" ) { Tensor b = rand({3, 4}, type); auto z = b.sort(1); auto z_sorted = std::get<0>(z); - REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); + CATCH_REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); } if(type.backend() != Backend::CUDA) - SECTION( "randperm" ) { + CATCH_SECTION( "randperm" ) { Tensor b = randperm(15, type); Tensor rv, ri; std::tie(rv, ri) = sort(b, 0); - REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); + CATCH_REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); } - SECTION( "context" ) { + CATCH_SECTION( "context" ) { std::stringstream ss; ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; } - SECTION( "add" ) { + CATCH_SECTION( "add" ) { Tensor a = rand({3, 4}, type); Tensor b = rand({3, 4}, type); Tensor c = add(a, add(a, b)); //TODO:0-dim Tensor d(3.f); Scalar d = 3.f; - REQUIRE( add(c, d).allclose(a + a + b + d) ); + CATCH_REQUIRE( add(c, d).allclose(a + a + b + d) ); } - SECTION( "loads of adds" ) { + CATCH_SECTION( "loads of adds" ) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); @@ -89,10 +89,10 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? 
std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); } - SECTION( "loads of adds (with copy)" ) { + CATCH_SECTION( "loads of adds (with copy)" ) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); @@ -102,59 +102,59 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); } - SECTION( "isContiguous" ) { + CATCH_SECTION( "isContiguous" ) { Tensor a = rand({3, 4}, type); - REQUIRE(a.is_contiguous()); + CATCH_REQUIRE(a.is_contiguous()); a = a.transpose(0, 1); - REQUIRE(!a.is_contiguous()); + CATCH_REQUIRE(!a.is_contiguous()); } - SECTION( "permute" ) { + CATCH_SECTION( "permute" ) { Tensor a = rand({3, 4, 5}, type); Tensor b = a.permute({1, 2, 0}); - REQUIRE(b.sizes().equals({4, 5, 3})); - REQUIRE(b.strides().equals({5, 1, 20})); + CATCH_REQUIRE(b.sizes().equals({4, 5, 3})); + CATCH_REQUIRE(b.strides().equals({5, 1, 20})); } - SECTION( "mm" ) { + CATCH_SECTION( "mm" ) { Tensor a = rand({3, 4}, type); Tensor b = rand({4}, type); Tensor c = mv(a, b); - REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); + CATCH_REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); } - SECTION( "squeeze" ) { + CATCH_SECTION( "squeeze" ) { Tensor a = rand({2, 1}, type); Tensor b = squeeze(a); - REQUIRE(b.dim() == 1); + CATCH_REQUIRE(b.dim() == 1); a = rand({1}, type); b = squeeze(a); //TODO 0-dim squeeze - REQUIRE(a[0].equal(b)); + CATCH_REQUIRE(a[0].equal(b)); } - SECTION( "copy" ) { + CATCH_SECTION( "copy" ) { Tensor a = zeros({4, 3}, type); Tensor e = rand({4, 3}, type); a.copy_(e); - REQUIRE(a.equal(e)); + CATCH_REQUIRE(a.equal(e)); } - SECTION( "copy (broadcasting)" ) { + CATCH_SECTION( "copy (broadcasting)" ) { Tensor a = zeros({4, 3}, type); Tensor e = rand({3}, type); a.copy_(e); for (int i = 0; i < 4; ++i) { - REQUIRE(a[i].equal(e)); + CATCH_REQUIRE(a[i].equal(e)); } } - SECTION( "abs(value)" ) { + CATCH_SECTION( "abs(value)" ) { Tensor r = at::abs(type.scalarTensor(-3)); - REQUIRE(r.toCInt() == 3); + CATCH_REQUIRE(r.toCInt() == 3); } //TODO(zach): operator overloads @@ -168,120 +168,120 @@ static void test(Type & type) { } #endif - SECTION( "adding a value with a scalar" ) { + CATCH_SECTION( "adding a value with a scalar" ) { Tensor a = rand({4, 3}, type); - REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + CATCH_REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); } - SECTION( "select" ) { + CATCH_SECTION( "select" ) { Tensor a = rand({3, 7}, type); auto a_13 = select(a, 1, 3); auto a_13_02 = select(select(a, 1, 3), 0, 2); - REQUIRE( a[0][3].equal(a_13[0]) ); - REQUIRE( a[2][3].equal(a_13_02) ); + CATCH_REQUIRE( a[0][3].equal(a_13[0]) ); + CATCH_REQUIRE( a[2][3].equal(a_13_02) ); } - SECTION( "zero-dim" ) { + CATCH_SECTION( "zero-dim" ) { Tensor a = type.scalarTensor(4); //rand(type, {1}); Tensor b = rand({3,4}, type); - REQUIRE((a + a).dim() == 0); - REQUIRE((1 + a).dim() == 0); - REQUIRE((b + a).dim() == 2); - REQUIRE((a + b).dim() == 2); + CATCH_REQUIRE((a + a).dim() == 0); + CATCH_REQUIRE((1 + a).dim() == 0); + CATCH_REQUIRE((b + a).dim() == 2); + CATCH_REQUIRE((a + b).dim() == 
2); auto c = rand({3,4}, type); - REQUIRE(c[1][2].dim() == 0); + CATCH_REQUIRE(c[1][2].dim() == 0); auto f = rand({3,4}, type); f[2] = zeros({4}, type); f[1][0] = -1; - REQUIRE(f[2][0].toCDouble() == 0); + CATCH_REQUIRE(f[2][0].toCDouble() == 0); } - SECTION( "tensor from TH" ) { + CATCH_SECTION( "tensor from TH" ) { int a = 4; THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); THFloatTensor_fill(t, a); Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); - REQUIRE_NOTHROW(tt); + CATCH_REQUIRE_NOTHROW(tt); } - SECTION( "toCFloat" ) { + CATCH_SECTION( "toCFloat" ) { Tensor a = zeros({3,4}); Tensor b = ones({3,7}); Tensor c = cat({a,b},1); - REQUIRE(c.size(1) == 11); + CATCH_REQUIRE(c.size(1) == 11); Tensor e = rand({}); - REQUIRE(*e.data() == e.sum().toCFloat()); + CATCH_REQUIRE(*e.data() == e.sum().toCFloat()); } - SECTION( "to string" ) { + CATCH_SECTION( "to string" ) { Tensor b = ones({3,7})*.0000001f; std::stringstream s; s << b << "\n"; std::string expect = "1e-07 *"; - REQUIRE(s.str().substr(0,expect.size()) == expect); + CATCH_REQUIRE(s.str().substr(0,expect.size()) == expect); } - SECTION("indexing by Scalar") { + CATCH_SECTION("indexing by Scalar") { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); for (int64_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (size_t i = 0; i < static_cast(tensor.numel()); ++i) { - REQUIRE(tensor[i].equal(one * static_cast(i))); + CATCH_REQUIRE(tensor[i].equal(one * static_cast(i))); } for (int i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (int16_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (int8_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[Scalar(3.14)].equal(one), StartsWith( "Can only index tensors with integral scalars")); } - SECTION("indexing by zero-dim tensor") { + CATCH_SECTION("indexing by zero-dim tensor") { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); for (int i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[one * i].equal(one * i)); + CATCH_REQUIRE(tensor[one * i].equal(one * i)); } - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[ones({}) * 3.14].equal(one), StartsWith( "Can only index tensors with integral scalars")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[Tensor()].equal(one), StartsWith("Can only index with tensors that are defined")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[ones({2, 3, 4}, kInt)].equal(one), StartsWith("Can only index with tensors that are scalars (zero-dim)")); } - SECTION("dispatch") { + CATCH_SECTION("dispatch") { Tensor tensor = randn({20, 20}); Tensor other = randn({20, 20}); auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); - REQUIRE(result.allclose(mse_loss(relu(tensor), other))); + CATCH_REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } - SECTION("core") { + CATCH_SECTION("core") { int i = CoreTest(); - REQUIRE(i + 1 == CoreTest()); + CATCH_REQUIRE(i + 1 == CoreTest()); } } -TEST_CASE( "basic tests CPU", "[cpu]" ) { +CATCH_TEST_CASE( "basic tests CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -TEST_CASE( "basic tests GPU", "[cuda]" ) { +CATCH_TEST_CASE( "basic tests GPU", "[cuda]" ) { manual_seed(123, at::kCUDA); 
if(at::hasCUDA()) { diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index cd5c43d32fae86..822a1d79df1bda 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -1,154 +1,154 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -TEST_CASE( "broadcast", "[]" ) { +CATCH_TEST_CASE( "broadcast", "[]" ) { manual_seed(123, at::kCPU); Type & T = CPU(kFloat); // 0) pre-req tests: - SECTION( "can't expand empty tensor" ) { + CATCH_SECTION( "can't expand empty tensor" ) { auto empty = randn({0}, T); - REQUIRE_THROWS(empty.expand({3})); + _CATCH_REQUIRE_THROWS(empty.expand({3})); } // 1) out-place function with 2 args - SECTION( "out-place function with 2 args" ) { + CATCH_SECTION( "out-place function with 2 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 1}, T); auto b = randn({5}, T); std::vector expanded_sizes = {3, 5}; - REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); + CATCH_REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aScalar = ones({1}, T); aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); auto b = randn({3, 5}, T); - REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); + CATCH_REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); } - SECTION( "old fallback behavior yields error" ) { + CATCH_SECTION( "old fallback behavior yields error" ) { auto a = randn({3, 5}, T); auto b = randn({5, 3}, T); - REQUIRE_THROWS(a + b); + _CATCH_REQUIRE_THROWS(a + b); } - SECTION( "with mismatched sizes" ) { + CATCH_SECTION( "with mismatched sizes" ) { auto a = randn({3, 5}, T); auto b = randn({7, 5}, T); - REQUIRE_THROWS(a + b); + _CATCH_REQUIRE_THROWS(a + b); } } - SECTION( "out-place function with 3 args" ) { + CATCH_SECTION( "out-place function with 3 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 1, 1}, T); auto b = randn({1, 2, 1}, T); auto c = randn({1, 1, 5}, T); std::vector expanded_sizes = {3, 2, 5}; - REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); + CATCH_REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aTensorScalar = ones({1}, T); aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); auto b = randn({3, 2, 1}, T); auto c = randn({1, 2, 5}, T); std::vector expanded_sizes = {3, 2, 5}; - REQUIRE(aTensorScalar.addcmul(b, c).equal( + CATCH_REQUIRE(aTensorScalar.addcmul(b, c).equal( aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); } - SECTION( "old fallback behavior yields error" ) { + CATCH_SECTION( "old fallback behavior yields error" ) { auto a = randn({3, 2, 5}, T); auto b = randn({2, 3, 5}, T); auto c = randn({5, 3, 2}, T); - REQUIRE_THROWS(a.addcmul(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); } - SECTION( "with mismatched sizes" ){ + CATCH_SECTION( "with mismatched sizes" ){ auto a = randn({3, 2, 5}, T); auto b = randn({2, 3, 5}, T); auto c = randn({5, 5, 5}, T); - REQUIRE_THROWS(a.addcmul(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); } } - SECTION( "in-place function with 2 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( 
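The `broadcast_test.cpp` cases being converted here rely on the standard broadcasting rule: sizes are aligned from the trailing dimension, and each pair of dimensions must either match or have one side equal to 1 (which is expanded). A standalone sketch of the shape computation those tests assume, for example `{3, 1}` with `{5}` broadcasting to `{3, 5}`; the in-place cases additionally require that the in-place operand itself never needs expanding, which this sketch does not model:

```cpp
#include <cassert>
#include <cstddef>
#include <stdexcept>
#include <vector>

// Compute the broadcast shape of two size lists; throws where the tests
// above expect the REQUIRE_THROWS branches to fire.
std::vector<long> broadcast_sizes(const std::vector<long>& a,
                                  const std::vector<long>& b) {
  std::size_t ndim = a.size() > b.size() ? a.size() : b.size();
  std::vector<long> out(ndim, 1);
  for (std::size_t i = 0; i < ndim; ++i) {  // walk from the trailing dimension
    long da = i < a.size() ? a[a.size() - 1 - i] : 1;
    long db = i < b.size() ? b[b.size() - 1 - i] : 1;
    if (da != db && da != 1 && db != 1)
      throw std::runtime_error("sizes are not broadcastable");
    out[ndim - 1 - i] = (da == 1) ? db : da;
  }
  return out;
}

int main() {
  assert((broadcast_sizes({3, 1}, {5}) == std::vector<long>{3, 5}));
  assert((broadcast_sizes({3, 1, 1}, {1, 2, 1}) == std::vector<long>{3, 2, 1}));

  bool threw = false;
  try { broadcast_sizes({3, 5}, {7, 5}); }         // "with mismatched sizes"
  catch (const std::exception&) { threw = true; }
  assert(threw);
  return 0;
}
```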
"in-place function with 2 args" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 5}, T); auto b = randn({3, 1}, T); - REQUIRE((a + b).equal(a + b.expand({3, 5}))); + CATCH_REQUIRE((a + b).equal(a + b.expand({3, 5}))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto a = randn({3, 5}, T); auto bScalar = ones({1}, T); bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); + CATCH_REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); } - SECTION( "error: would have to expand inplace arg" ) { + CATCH_SECTION( "error: would have to expand inplace arg" ) { auto a = randn({1, 5}, T); auto b = randn({3, 1}, T); - REQUIRE_THROWS(a.add_(b)); + _CATCH_REQUIRE_THROWS(a.add_(b)); } } - SECTION( "in-place function with 3 args" ) { + CATCH_SECTION( "in-place function with 3 args" ) { auto a = randn({3, 5, 2}, T); auto b = randn({3, 1, 2}, T); auto c = randn({1, 5, 1}, T); - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto aClone = a.clone(); - REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); + CATCH_REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aClone = a.clone(); auto bScalar = ones({1}, T); bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); + CATCH_REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); } - SECTION( "error: would have to expand inplace arg" ) { + CATCH_SECTION( "error: would have to expand inplace arg" ) { auto a = randn({1, 3, 5}, T); auto b = randn({4, 1, 1}, T); auto c = randn({1, 3, 1}, T); - REQUIRE_THROWS(a.addcmul_(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul_(b, c)); } } - SECTION( "explicit dim specification" ) { + CATCH_SECTION( "explicit dim specification" ) { auto a = randn({1}, T); auto b = randn({5, 3}, T); auto c = randn({3, 7}, T); - SECTION( "basic" ) { - REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); + CATCH_SECTION( "basic" ) { + CATCH_REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { Tensor aScalar = ones({1}, T); aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); + CATCH_REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); } - SECTION( "with mismatched sizes" ) { + CATCH_SECTION( "with mismatched sizes" ) { auto a = randn({3, 3}, T); - REQUIRE_THROWS(a.addmm(b, c)); + _CATCH_REQUIRE_THROWS(a.addmm(b, c)); } } } diff --git a/aten/src/ATen/test/catch_utils.hpp b/aten/src/ATen/test/catch_utils.hpp new file mode 100644 index 00000000000000..b9b0a87990a9ce --- /dev/null +++ b/aten/src/ATen/test/catch_utils.hpp @@ -0,0 +1,8 @@ +#pragma once + +#define CATCH_CONFIG_PREFIX_ALL +#include + +// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning; +// define our own version that doesn't warn. +#define _CATCH_REQUIRE_THROWS( ... 
) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ ) diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu index fa00e534ee07ef..cce267100589e1 100644 --- a/aten/src/ATen/test/cuda_half_test.cu +++ b/aten/src/ATen/test/cuda_half_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -82,9 +82,9 @@ void launch_function(){ kernel<<<1,1>>>(); } -TEST_CASE( "half common math functions tests in device", "[cuda]" ) { +CATCH_TEST_CASE( "half common math functions tests in device", "[cuda]" ) { launch_function(); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index 9956dcf52b04ef..b64c530b355914 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/optional.h" @@ -8,15 +8,15 @@ using namespace at; -TEST_CASE( "optional in cuda files", "[cuda]" ) { +CATCH_TEST_CASE( "optional in cuda files", "[cuda]" ) { at::optional trivially_destructible; at::optional> non_trivially_destructible; - REQUIRE(!trivially_destructible.has_value()); - REQUIRE(!non_trivially_destructible.has_value()); + CATCH_REQUIRE(!trivially_destructible.has_value()); + CATCH_REQUIRE(!non_trivially_destructible.has_value()); trivially_destructible = {5}; non_trivially_destructible = std::vector{5, 10}; - REQUIRE(trivially_destructible.has_value()); - REQUIRE(non_trivially_destructible.has_value()); + CATCH_REQUIRE(trivially_destructible.has_value()); + CATCH_REQUIRE(non_trivially_destructible.has_value()); } diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu new file mode 100644 index 00000000000000..a529f38d748a1b --- /dev/null +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -0,0 +1,46 @@ +#define CATCH_CONFIG_MAIN +#include "catch_utils.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" +#include "ATen/core/TensorAccessor.h" +#include "ATen/cuda/CUDAContext.h" + +#include + +using namespace at; + +__global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor resa, + PackedTensorAccessor t1a, + PackedTensorAccessor t2a){ + for (int64_t i = 0; i < resa.size(0); i++) { + float val = 0.0f; + for (int64_t j = 0; j < t1a.size(1); j++) { + val += t1a[i][j] * t2a[j]; + } + resa[i] = val; + } +} + +CATCH_TEST_CASE( "test PackedTensorAccessor and Tensor.packed_accessor", "[cuda]" ) { + manual_seed(123, at::kCPU); + manual_seed(123, at::kCUDA); + + Tensor t1 = rand({4, 4}, CUDA(kFloat)); + Tensor t2 = rand({4}, CUDA(kFloat)); + Tensor res = empty({4}, CUDA(kFloat)); + + auto t1a = t1.packed_accessor(); + auto t2a = t2.packed_accessor(); + auto resa = res.packed_accessor(); + + auto stream = at::cuda::getCurrentCUDAStream(); + + test_tensor_packed_accessor_kernel<<<1, 1, 0, stream>>>(resa, t1a, t2a); + cudaError_t err = cudaDeviceSynchronize(); + CATCH_REQUIRE(err == cudaSuccess); + + auto expected = mv(t1, t2); + + CATCH_REQUIRE(res.allclose(expected)); +} diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp index d32903dd2fe1f3..7b14174d3baeb3 100644 --- 
a/aten/src/ATen/test/cuda_rng_test.cpp +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "cuda.h" @@ -21,7 +21,7 @@ void testCudaRNGMultithread() { } }; -TEST_CASE( "CUDA RNG test", "[cuda]" ) { - SECTION( "multithread" ) +CATCH_TEST_CASE( "CUDA RNG test", "[cuda]" ) { + CATCH_SECTION( "multithread" ) testCudaRNGMultithread(); } diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 31786e88a0944d..4391867d166772 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/cudnn/Descriptors.h" @@ -9,7 +9,7 @@ using namespace at; using namespace at::native; -TEST_CASE( "cudnn", "[cuda]" ) { +CATCH_TEST_CASE( "cudnn", "[cuda]" ) { manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 @@ -18,8 +18,8 @@ TEST_CASE( "cudnn", "[cuda]" ) { desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); desc2.set(handle, 0.5, desc1.state); - REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); - REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); - REQUIRE(desc1.desc()->states == desc2.desc()->states); + CATCH_REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); + CATCH_REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); + CATCH_REQUIRE(desc1.desc()->states == desc2.desc()->states); #endif } diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index 48829298760276..bf0cf93f7c4064 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,17 +11,17 @@ using namespace at; -TEST_CASE( "dlconvertor", "[cpu]" ) { +CATCH_TEST_CASE( "dlconvertor", "[cpu]" ) { manual_seed(123, at::kCPU); - INFO( "convert ATen to DLTensor" ); + CATCH_INFO( "convert ATen to DLTensor" ); Tensor a = rand({3,4}); DLManagedTensor* dlMTensor = toDLPack(a); - INFO( "convert DLTensor to ATen" ); + CATCH_INFO( "convert DLTensor to ATen" ); Tensor b = fromDLPack(dlMTensor); - REQUIRE(a.equal(b)); + CATCH_REQUIRE(a.equal(b)); } diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp index 3b2944803e6b5a..32177705a2f883 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include #include @@ -12,53 +12,53 @@ using namespace at; -TEST_CASE( "half arithmetic", "[]" ) { +CATCH_TEST_CASE( "half arithmetic", "[]" ) { Half zero = 0; Half one = 1; - REQUIRE(zero + one == one); - REQUIRE(zero + zero == zero); - REQUIRE(zero * one == zero); - REQUIRE(one * one == one); - REQUIRE(one / one == one); - REQUIRE(one - one == zero); - REQUIRE(one - zero == one); - REQUIRE(zero - one == -one); - REQUIRE(one + one == Half(2)); - REQUIRE(one + one == 2); + CATCH_REQUIRE(zero + one == one); + CATCH_REQUIRE(zero + zero == zero); + CATCH_REQUIRE(zero * one == zero); + CATCH_REQUIRE(one * one == one); + CATCH_REQUIRE(one / one == one); + CATCH_REQUIRE(one - one == zero); + CATCH_REQUIRE(one - zero == one); + CATCH_REQUIRE(zero - one == -one); + CATCH_REQUIRE(one + one == Half(2)); + CATCH_REQUIRE(one + one == 2); } -TEST_CASE( "half comparisons", "[]" ) { 
+CATCH_TEST_CASE( "half comparisons", "[]" ) { Half zero = 0; Half one = 1; - REQUIRE(zero < one); - REQUIRE(zero < 1); - REQUIRE(1 > zero); - REQUIRE(0 >= zero); - REQUIRE(0 != one); - REQUIRE(zero == 0); - REQUIRE(zero == zero); - REQUIRE(zero == -zero); + CATCH_REQUIRE(zero < one); + CATCH_REQUIRE(zero < 1); + CATCH_REQUIRE(1 > zero); + CATCH_REQUIRE(0 >= zero); + CATCH_REQUIRE(0 != one); + CATCH_REQUIRE(zero == 0); + CATCH_REQUIRE(zero == zero); + CATCH_REQUIRE(zero == -zero); } -TEST_CASE( "half cast", "[]" ) { +CATCH_TEST_CASE( "half cast", "[]" ) { Half value = 1.5f; - REQUIRE((int)value == 1); - REQUIRE((short)value == 1); - REQUIRE((long long)value == 1LL); - REQUIRE((float)value == 1.5f); - REQUIRE((double)value == 1.5); - REQUIRE((bool)value == true); - REQUIRE((bool)Half(0.0f) == false); + CATCH_REQUIRE((int)value == 1); + CATCH_REQUIRE((short)value == 1); + CATCH_REQUIRE((long long)value == 1LL); + CATCH_REQUIRE((float)value == 1.5f); + CATCH_REQUIRE((double)value == 1.5); + CATCH_REQUIRE((bool)value == true); + CATCH_REQUIRE((bool)Half(0.0f) == false); } -TEST_CASE( "half construction", "[]" ) { - REQUIRE(Half((short)3) == Half(3.0f)); - REQUIRE(Half((unsigned short)3) == Half(3.0f)); - REQUIRE(Half(3) == Half(3.0f)); - REQUIRE(Half(3U) == Half(3.0f)); - REQUIRE(Half(3LL) == Half(3.0f)); - REQUIRE(Half(3ULL) == Half(3.0f)); - REQUIRE(Half(3.5) == Half(3.5f)); +CATCH_TEST_CASE( "half construction", "[]" ) { + CATCH_REQUIRE(Half((short)3) == Half(3.0f)); + CATCH_REQUIRE(Half((unsigned short)3) == Half(3.0f)); + CATCH_REQUIRE(Half(3) == Half(3.0f)); + CATCH_REQUIRE(Half(3U) == Half(3.0f)); + CATCH_REQUIRE(Half(3LL) == Half(3.0f)); + CATCH_REQUIRE(Half(3ULL) == Half(3.0f)); + CATCH_REQUIRE(Half(3.5) == Half(3.5f)); } static std::string to_string(const Half& h) { @@ -67,22 +67,22 @@ static std::string to_string(const Half& h) { return ss.str(); } -TEST_CASE( "half to string", "[]" ) { - REQUIRE(to_string(Half(3.5f)) == "3.5"); - REQUIRE(to_string(Half(-100.0f)) == "-100"); +CATCH_TEST_CASE( "half to string", "[]" ) { + CATCH_REQUIRE(to_string(Half(3.5f)) == "3.5"); + CATCH_REQUIRE(to_string(Half(-100.0f)) == "-100"); } -TEST_CASE( "half numeric limits", "[]" ) { +CATCH_TEST_CASE( "half numeric limits", "[]" ) { using limits = std::numeric_limits; - REQUIRE(limits::lowest() == -65504.0f); - REQUIRE(limits::max() == 65504.0f); - REQUIRE(limits::min() > 0); - REQUIRE(limits::min() < 1); - REQUIRE(limits::denorm_min() > 0); - REQUIRE(limits::denorm_min() / 2 == 0); - REQUIRE(limits::infinity() == std::numeric_limits::infinity()); - REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); - REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); + CATCH_REQUIRE(limits::lowest() == -65504.0f); + CATCH_REQUIRE(limits::max() == 65504.0f); + CATCH_REQUIRE(limits::min() > 0); + CATCH_REQUIRE(limits::min() < 1); + CATCH_REQUIRE(limits::denorm_min() > 0); + CATCH_REQUIRE(limits::denorm_min() / 2 == 0); + CATCH_REQUIRE(limits::infinity() == std::numeric_limits::infinity()); + CATCH_REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); + CATCH_REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); } // Check the declared type of members of numeric_limits matches @@ -119,7 +119,7 @@ ASSERT_SAME_TYPE(max_exponent10); ASSERT_SAME_TYPE(traps); ASSERT_SAME_TYPE(tinyness_before); -TEST_CASE( "half common math functions test", "[]" ) { +CATCH_TEST_CASE( "half common math functions test", "[]" ) { float threshold = 0.00001; assert(std::abs(std::lgamma(Half(10.0)) - std::lgamma(10.0f)) <= 
threshold); assert(std::abs(std::exp(Half(1.0)) - std::exp(1.0f)) <= threshold); diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu index 4c63ab3a8fd205..d09a423d7ca72d 100644 --- a/aten/src/ATen/test/integer_divider_test.cu +++ b/aten/src/ATen/test/integer_divider_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" // Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or // (b-1), so it takes a few minutes to run. @@ -62,18 +62,18 @@ class IntDividerTester { cudaError_t err; err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider)); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase)); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } ~IntDividerTester() { cudaError_t err; err = cudaFree(dividersBuf_); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaFree(testCasesBuf_); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } void addTestCase(Value dividend, Value divisor, int steps) { @@ -92,18 +92,18 @@ class IntDividerTester { cudaError_t err; if (testCases_.empty()) return; - REQUIRE(!dividers_.empty()); + CATCH_REQUIRE(!dividers_.empty()); - REQUIRE(dividers_.size() <= NUM_CASES); - REQUIRE(testCases_.size() <= NUM_CASES); + CATCH_REQUIRE(dividers_.size() <= NUM_CASES); + CATCH_REQUIRE(testCases_.size() <= NUM_CASES); err = cudaMemcpy(dividersBuf_, dividers_.data(), dividers_.size() * sizeof(IntDivider), cudaMemcpyHostToDevice); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaMemcpy(testCasesBuf_, testCases_.data(), testCases_.size() * sizeof(TestCase), cudaMemcpyHostToDevice); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); int numCases = testCases_.size(); testIntDivider<<<512, 512>>>( @@ -180,11 +180,11 @@ static void testUint64Divider() tester.flush(); } -TEST_CASE( "CUDA integer divider", "[cuda]" ) { +CATCH_TEST_CASE( "CUDA integer divider", "[cuda]" ) { testUint64Divider(); testUint32Divider(); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index e10de30ae8e023..4c57b7d8ee1d96 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -9,18 +9,18 @@ using namespace at; using Catch::Matchers::StartsWith; #define REQUIRE_EQUAL(t1, t2) \ - REQUIRE(t1.equal(t2)); + CATCH_REQUIRE(t1.equal(t2)); #define REQUIRE_ALLCLOSE(t1, t2) \ - REQUIRE(t1.is_same_size(t2)); \ - REQUIRE(t1.allclose(t2)); + CATCH_REQUIRE(t1.is_same_size(t2)); \ + CATCH_REQUIRE(t1.allclose(t2)); #define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ - REQUIRE(t1.is_same_size(t2)); \ - REQUIRE(t1.allclose(t2, atol, rtol)); + CATCH_REQUIRE(t1.is_same_size(t2)); \ + CATCH_REQUIRE(t1.allclose(t2, atol, rtol)); void requireEqualTensorList(TensorList t1, TensorList t2) { - REQUIRE(t1.size() == t2.size()); + CATCH_REQUIRE(t1.size() == t2.size()); for (size_t i = 0; i < t1.size(); ++i) { REQUIRE_EQUAL(t1[ i ], t2[ i ]); } @@ -29,7 +29,7 @@ void requireEqualTensorList(TensorList t1, TensorList t2) { void test(Type & T, Type & AccT) { auto t = randn({3, 3}, T); - SECTION( "split: test method, type, 
namespace give same result" ) { + CATCH_SECTION( "split: test method, type, namespace give same result" ) { auto splitMethod = t.split(1, 0); auto splitType = T.split(t, 1, 0); auto splitNs = at::split(t, 1, 0); @@ -40,7 +40,7 @@ void test(Type & T, Type & AccT) { REQUIRE_EQUAL(at::cat(splitMethod, 0), t); } - SECTION( "chunk: test method, type, namespace give same result" ) { + CATCH_SECTION( "chunk: test method, type, namespace give same result" ) { // test method, type, namespace give same result auto chunkMethod = t.chunk(3, 0); auto chunkType = T.chunk(t, 3, 0); @@ -53,7 +53,7 @@ void test(Type & T, Type & AccT) { } // stack - SECTION( "stack" ) { + CATCH_SECTION( "stack" ) { auto x = rand({2, 3, 4}); auto y = rand({2, 3, 4}); auto z = rand({2, 3, 4}); @@ -66,36 +66,36 @@ void test(Type & T, Type & AccT) { expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); REQUIRE_EQUAL(res, res_neg); - REQUIRE(res.sizes().equals(expected_size)); + CATCH_REQUIRE(res.sizes().equals(expected_size)); REQUIRE_EQUAL(res.select(dim, 0), x); REQUIRE_EQUAL(res.select(dim, 1), y); REQUIRE_EQUAL(res.select(dim, 2), z); } } - SECTION( "size / stride" ) { + CATCH_SECTION( "size / stride" ) { auto scalar = randn({}, T); - REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); auto empty = randn({0}, T); - REQUIRE(empty.size(0) == 0); - REQUIRE(empty.size(-1) == 0); - REQUIRE(empty.stride(0) == 1); - REQUIRE(empty.stride(-1) == 1); + CATCH_REQUIRE(empty.size(0) == 0); + CATCH_REQUIRE(empty.size(-1) == 0); + CATCH_REQUIRE(empty.stride(0) == 1); + CATCH_REQUIRE(empty.stride(-1) == 1); } // matmul - SECTION( "matmul" ) { + CATCH_SECTION( "matmul" ) { auto scalar = randn({}, T); auto d1 = randn({3}, T); auto d2 = randn({2, 3}, T); // 0-d - REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + CATCH_REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + CATCH_REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); // 1-d REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); @@ -140,11 +140,11 @@ void test(Type & T, Type & AccT) { // non-expandable case auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); - REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); + CATCH_REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); } // _standard_gamma_grad - SECTION( "_standard_gamma_grad" ) { + CATCH_SECTION( "_standard_gamma_grad" ) { // check empty 
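
Illustrative sketch only, not part of the patch: every test file touched above follows the same pattern once it switches from catch.hpp to catch_utils.hpp, i.e. it uses the CATCH_-prefixed Catch macros, with the underscore-prefixed _CATCH_REQUIRE_THROWS standing in for REQUIRE_THROWS. The test name and tensor shapes below are made up for illustration; the macros are the ones introduced by this diff.

#define CATCH_CONFIG_MAIN
#include "catch_utils.hpp"
#include "ATen/ATen.h"

CATCH_TEST_CASE( "prefixed macro usage (illustration)", "[cpu]" ) {
  auto t = at::ones({2, 3});
  CATCH_SECTION( "plain assertion" ) {
    CATCH_REQUIRE(t.size(0) == 2);
  }
  CATCH_SECTION( "expected throw" ) {
    // wrapper defined in catch_utils.hpp; avoids the warning that the
    // stock CATCH_REQUIRE_THROWS produces under CATCH_CONFIG_PREFIX_ALL
    _CATCH_REQUIRE_THROWS(t.expand({7}));
  }
}
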
auto empty = ones({0}, T); REQUIRE_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); @@ -158,10 +158,10 @@ void test(Type & T, Type & AccT) { // check mixing types auto t1 = randn({3, 4}, T); auto t2 = randn({3, 4}, T).toType(kDouble); - REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); + CATCH_REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); } - SECTION( "where" ) { + CATCH_SECTION( "where" ) { // empty auto empty = ones({0}, T); auto &bT = T.toScalarType(ScalarType::Byte); @@ -180,13 +180,13 @@ void test(Type & T, Type & AccT) { } } -TEST_CASE( "native test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "native test CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } -TEST_CASE( "native test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "native test CUDA", "[cuda]" ) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index d52dc27e20295e..964f6260e7d9ff 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -18,14 +18,14 @@ using namespace at; _passed = true; \ els; \ } catch (std::exception &e) { \ - REQUIRE(!_passed); \ + CATCH_REQUIRE(!_passed); \ catc; \ } \ } void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { - REQUIRE(lhs.dim() == rhs.dim()); - REQUIRE(lhs.sizes().equals(rhs.sizes())); + CATCH_REQUIRE(lhs.dim() == rhs.dim()); + CATCH_REQUIRE(lhs.sizes().equals(rhs.sizes())); } bool should_expand(const IntList &from_size, const IntList &to_size) { @@ -49,15 +49,15 @@ void test(Type &T) { for (auto s = sizes.begin(); s != sizes.end(); ++s) { // verify that the dim, sizes, strides, etc match what was requested. auto t = ones(*s, T); - REQUIRE((size_t)t.dim() == s->size()); - REQUIRE((size_t)t.ndimension() == s->size()); - REQUIRE(t.sizes().equals(*s)); - REQUIRE(t.strides().size() == s->size()); + CATCH_REQUIRE((size_t)t.dim() == s->size()); + CATCH_REQUIRE((size_t)t.ndimension() == s->size()); + CATCH_REQUIRE(t.sizes().equals(*s)); + CATCH_REQUIRE(t.strides().size() == s->size()); auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); - REQUIRE(t.numel() == numel); + CATCH_REQUIRE(t.numel() == numel); // verify we can output std::stringstream ss; - REQUIRE_NOTHROW(ss << t << std::endl); + CATCH_REQUIRE_NOTHROW(ss << t << std::endl); // set_ auto t2 = ones(*s, T); @@ -65,22 +65,22 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze - REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + CATCH_REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); // unsqueeze_ { auto t2 = ones(*s, T); auto r = t2.unsqueeze_(0); - REQUIRE(r.dim() == t.dim() + 1); + CATCH_REQUIRE(r.dim() == t.dim() + 1); } // squeeze (with dimension argument) if (t.dim() == 0 || t.sizes()[0] == 1) { - REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); } else { // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; // in NumPy this is an error. 
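
As an aside to the squeeze comments in the hunk above, the PyTorch-versus-NumPy behaviour they describe can be summarised in a short sketch (illustrative only, not part of the patch; shapes are arbitrary):

#include "ATen/ATen.h"

void squeeze_semantics_sketch() {
  auto t = at::ones({3, 4});
  AT_ASSERT(t.squeeze(0).dim() == t.dim());  // dim 0 has size 3: squeeze is a no-op in PyTorch
  auto u = at::ones({1, 4});
  AT_ASSERT(u.squeeze(0).dim() == 1);        // dim 0 has size 1: the dimension is removed
}
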
- REQUIRE(t.squeeze(0).dim() == t.dim()); + CATCH_REQUIRE(t.squeeze(0).dim() == t.dim()); } // squeeze (with no dimension argument) @@ -99,11 +99,11 @@ void test(Type &T) { // squeeze_ (with dimension argument) auto t2 = ones(*s, T); if (t2.dim() == 0 || t2.sizes()[0] == 1) { - REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); } else { // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; // in NumPy this is an error. - REQUIRE(t2.squeeze_(0).dim() == t.dim()); + CATCH_REQUIRE(t2.squeeze_(0).dim() == t.dim()); } } @@ -122,31 +122,31 @@ void test(Type &T) { // reduce (with dimension argument and with 1 return argument) if (t.numel() != 0) { - REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE(t.sum(0).equal(at::zeros({}, T))); + CATCH_REQUIRE(t.sum(0).equal(at::zeros({}, T))); } // reduce (with dimension argument and with 2 return arguments) if (t.numel() != 0) { auto ret = t.min(0); - REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); - REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE_THROWS(t.min(0)); + _CATCH_REQUIRE_THROWS(t.min(0)); } // simple indexing if (t.dim() > 0 && t.numel() != 0) { - REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE_THROWS(t[0]); + _CATCH_REQUIRE_THROWS(t[0]); } // fill_ (argument to fill_ can only be a 0-dim tensor) TRY_CATCH_ELSE(t.fill_(t.sum(0)), - REQUIRE(t.dim() > 1), - REQUIRE(t.dim() <= 1)); + CATCH_REQUIRE(t.dim() > 1), + CATCH_REQUIRE(t.dim() <= 1)); } for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { @@ -156,8 +156,8 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); if(*lhs_it != *rhs_it) { - REQUIRE(!lhs.is_same_size(rhs)); - REQUIRE(!rhs.is_same_size(lhs)); + CATCH_REQUIRE(!lhs.is_same_size(rhs)); + CATCH_REQUIRE(!rhs.is_same_size(lhs)); } } // forced size functions (resize_, resize_as, set_) @@ -192,7 +192,7 @@ void test(Type &T) { auto storage = T.storage(rhs.numel(), false); lhs.set_(storage); // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars - REQUIRE(lhs.dim() != 0); + CATCH_REQUIRE(lhs.dim() != 0); } { // with storage, offset, sizes, strides @@ -211,8 +211,8 @@ void test(Type &T) { auto rhs = ones(*rhs_it, T); auto rhs_size = *rhs_it; TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), - REQUIRE(lhs.numel() != rhs.numel()), - REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); + CATCH_REQUIRE(lhs.numel() != rhs.numel()), + CATCH_REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); } // take @@ -220,7 +220,7 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); TRY_CATCH_ELSE(auto result = lhs.take(rhs), - REQUIRE(lhs.numel() == 0); REQUIRE(rhs.numel() != 0), + CATCH_REQUIRE(lhs.numel() == 0); CATCH_REQUIRE(rhs.numel() != 0), require_equal_size_dim(result, rhs)); } @@ -230,7 +230,7 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); TRY_CATCH_ELSE(auto result = lhs.ger(rhs), - REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), + CATCH_REQUIRE((lhs.numel() == 0 || 
rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), [&]() { int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); int64_t dim1 = rhs.dim() == 0 ? 1 : rhs.size(0); @@ -246,8 +246,8 @@ void test(Type &T) { auto rhs_size = *rhs_it; bool should_pass = should_expand(lhs_size, rhs_size); TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), - REQUIRE(!should_pass), - REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + CATCH_REQUIRE(!should_pass), + CATCH_REQUIRE(should_pass); require_equal_size_dim(result, rhs);); // in-place functions (would be good if we can also do a non-broadcasting one, b/c // broadcasting functions will always end up operating on tensors of same size; @@ -255,21 +255,21 @@ void test(Type &T) { { bool should_pass_inplace = should_expand(rhs_size, lhs_size); TRY_CATCH_ELSE(lhs.add_(rhs), - REQUIRE(!should_pass_inplace), - REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); + CATCH_REQUIRE(!should_pass_inplace), + CATCH_REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); } } } } } -TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 72ef4e4ad3cf4c..247830c3cc839c 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include // define constants like M_PI and C keywords for MSVC @@ -33,25 +33,25 @@ struct Foo { void test_overflow() { auto s1 = Scalar(M_PI); - REQUIRE(s1.toFloat() == static_cast(M_PI)); + CATCH_REQUIRE(s1.toFloat() == static_cast(M_PI)); s1.toHalf(); s1 = Scalar(100000); - REQUIRE(s1.toFloat() == 100000.0); - REQUIRE(s1.toInt() == 100000); + CATCH_REQUIRE(s1.toFloat() == 100000.0); + CATCH_REQUIRE(s1.toInt() == 100000); - REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + CATCH_REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); s1 = Scalar(NAN); - REQUIRE(std::isnan(s1.toFloat())); - REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + CATCH_REQUIRE(std::isnan(s1.toFloat())); + CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); s1 = Scalar(INFINITY); - REQUIRE(std::isinf(s1.toFloat())); - REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + CATCH_REQUIRE(std::isinf(s1.toFloat())); + CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); } -TEST_CASE( "scalar test", "[]" ) { +CATCH_TEST_CASE( "scalar test", "[]" ) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); @@ -62,7 +62,7 @@ TEST_CASE( "scalar test", "[]" ) { Scalar h2 = h; cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; Generator & gen = at::globalContext().defaultGenerator(at::kCPU); - REQUIRE_NOTHROW(gen.seed()); + CATCH_REQUIRE_NOTHROW(gen.seed()); auto && C = at::globalContext(); if(at::hasCUDA()) { auto t2 = zeros({4,4}, at::kCUDA); @@ -71,12 +71,12 @@ TEST_CASE( "scalar test", "[]" ) { auto t = ones({4,4}); auto wha2 = zeros({4,4}).add(t).sum(); - REQUIRE( wha2.toCDouble() == 16.0 ); + CATCH_REQUIRE( wha2.toCDouble() == 16.0 ); - REQUIRE( t.sizes()[0] == 4 ); - REQUIRE( t.sizes()[1] == 4 ); - REQUIRE( t.strides()[0] == 4 ); - REQUIRE( t.strides()[1] == 1 ); + CATCH_REQUIRE( 
t.sizes()[0] == 4 ); + CATCH_REQUIRE( t.sizes()[1] == 4 ); + CATCH_REQUIRE( t.strides()[0] == 4 ); + CATCH_REQUIRE( t.strides()[1] == 1 ); Type & T = CPU(Float); Tensor x = randn({1,10}, T); @@ -88,26 +88,26 @@ TEST_CASE( "scalar test", "[]" ) { Tensor next_h = i2h.add(h2h); next_h = next_h.tanh(); - REQUIRE_THROWS(at::_local_scalar(Tensor{})); + _CATCH_REQUIRE_THROWS(at::_local_scalar(Tensor{})); test_overflow(); if(at::hasCUDA()) { auto r = CUDA(Float).copy(next_h); - REQUIRE(CPU(Float).copy(r).equal(next_h)); + CATCH_REQUIRE(CPU(Float).copy(r).equal(next_h)); } - REQUIRE_NOTHROW(randn({10,10,2}, T)); + CATCH_REQUIRE_NOTHROW(randn({10,10,2}, T)); // check Scalar.toTensor on Scalars backed by different data types - REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); - REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); - REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); + CATCH_REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); + CATCH_REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); + CATCH_REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); if (x.type().scalarType() != ScalarType::Half) { AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { scalar_t s = 1; std::stringstream ss; - REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + CATCH_REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); auto data = (scalar_t*)x.data_ptr(); (void)data; }); @@ -116,10 +116,10 @@ TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { auto x = ones({1,2}, T); - REQUIRE_THROWS(x.toCFloat()); + _CATCH_REQUIRE_THROWS(x.toCFloat()); } auto float_one = ones({}, T); - REQUIRE(float_one.toCFloat() == 1); - REQUIRE(float_one.toCInt() == 1); - REQUIRE((float_one.toCHalf() == 1)); + CATCH_REQUIRE(float_one.toCFloat() == 1); + CATCH_REQUIRE(float_one.toCInt() == 1); + CATCH_REQUIRE((float_one.toCHalf() == 1)); } diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 145c4f4c261276..8dc015dd1d06ae 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAGuard.h" @@ -14,7 +14,7 @@ /* Tests related to ATen streams. 
*/ -TEST_CASE( +CATCH_TEST_CASE( "Copying and Moving Streams", "Verifies streams are live through copying and moving") { int32_t device = -1; @@ -29,14 +29,14 @@ TEST_CASE( copyStream = s; - REQUIRE(copyStream.internals() == s.internals()); - REQUIRE(copyStream.device() == device); - REQUIRE(copyStream.stream() == cuda_stream); + CATCH_REQUIRE(copyStream.internals() == s.internals()); + CATCH_REQUIRE(copyStream.device() == device); + CATCH_REQUIRE(copyStream.stream() == cuda_stream); } - REQUIRE(copyStream.internals()); - REQUIRE(copyStream.device() == device); - REQUIRE(copyStream.stream() == cuda_stream); + CATCH_REQUIRE(copyStream.internals()); + CATCH_REQUIRE(copyStream.device() == device); + CATCH_REQUIRE(copyStream.stream() == cuda_stream); // Tests that moving works as expected and preserves the stream at::cuda::CUDAStream moveStream; @@ -47,41 +47,41 @@ TEST_CASE( moveStream = std::move(s); - REQUIRE(moveStream.device() == device); - REQUIRE(moveStream.stream() == cuda_stream); + CATCH_REQUIRE(moveStream.device() == device); + CATCH_REQUIRE(moveStream.stream() == cuda_stream); } - REQUIRE(moveStream.internals()); - REQUIRE(moveStream.device() == device); - REQUIRE(moveStream.stream() == cuda_stream); + CATCH_REQUIRE(moveStream.internals()); + CATCH_REQUIRE(moveStream.device() == device); + CATCH_REQUIRE(moveStream.stream() == cuda_stream); } -TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { +CATCH_TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets at::cuda::setCurrentCUDAStream(myStream); at::cuda::CUDAStream curStream = at::cuda::getCurrentCUDAStream(); - REQUIRE(myStream == curStream); + CATCH_REQUIRE(myStream == curStream); // Gets, sets, and gets default stream at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); at::cuda::setCurrentCUDAStream(defaultStream); curStream = at::cuda::getCurrentCUDAStream(); - REQUIRE(defaultStream != myStream); - REQUIRE(curStream == defaultStream); + CATCH_REQUIRE(defaultStream != myStream); + CATCH_REQUIRE(curStream == defaultStream); } void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { auto new_stream = at::cuda::createCUDAStream(); at::cuda::setCurrentCUDAStream(new_stream); cur_thread_stream = at::cuda::getCurrentCUDAStream(); - REQUIRE(cur_thread_stream == new_stream); + CATCH_REQUIRE(cur_thread_stream == new_stream); } -TEST_CASE( +CATCH_TEST_CASE( "Multithread Getting and Setting", "Ensures streams are thread local") { at::cuda::CUDAStream s0, s1; @@ -94,25 +94,25 @@ TEST_CASE( at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); - REQUIRE(cur_stream == default_stream); - REQUIRE(cur_stream != s0); - REQUIRE(cur_stream != s1); - REQUIRE(s0 != s1); + CATCH_REQUIRE(cur_stream == default_stream); + CATCH_REQUIRE(cur_stream != s0); + CATCH_REQUIRE(cur_stream != s1); + CATCH_REQUIRE(s0 != s1); } -TEST_CASE("CUDAGuard") { +CATCH_TEST_CASE("CUDAGuard") { if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::current_device() == 0); std::vector streams0 = { at::cuda::getDefaultCUDAStream(), at::cuda::createCUDAStream()}; - REQUIRE(streams0[0].device() == 0); - REQUIRE(streams0[1].device() == 0); + CATCH_REQUIRE(streams0[0].device() == 0); + CATCH_REQUIRE(streams0[1].device() == 0); 
at::cuda::setCurrentCUDAStream(streams0[0]); std::vector streams1; @@ -121,47 +121,47 @@ TEST_CASE("CUDAGuard") { streams1.push_back(at::cuda::getDefaultCUDAStream()); streams1.push_back(at::cuda::createCUDAStream()); } - REQUIRE(streams1[0].device() == 1); - REQUIRE(streams1[1].device() == 1); + CATCH_REQUIRE(streams1[0].device() == 1); + CATCH_REQUIRE(streams1[1].device() == 1); at::cuda::setCurrentCUDAStream(streams1[0]); - REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::current_device() == 0); // -- end setup // Test that all original streams are recorded. { at::cuda::CUDAGuard guard; - REQUIRE(guard.original_streams().empty()); + CATCH_REQUIRE(guard.original_streams().empty()); guard.set_stream(streams0[0]); - REQUIRE( + CATCH_REQUIRE( guard.original_streams().size() == at::cuda::getNumGPUs()); - REQUIRE(guard.original_streams()[0] == streams0[0]); - REQUIRE(guard.original_streams()[1] == streams1[0]); + CATCH_REQUIRE(guard.original_streams()[0] == streams0[0]); + CATCH_REQUIRE(guard.original_streams()[1] == streams1[0]); } // Setting a stream changes the current device and the stream on that device { at::cuda::CUDAGuard guard(streams1[1]); - REQUIRE(guard.last_device() == 1); - REQUIRE(at::cuda::current_device() == 1); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); + CATCH_REQUIRE(guard.last_device() == 1); + CATCH_REQUIRE(at::cuda::current_device() == 1); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); } // Device and stream are now reset - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); // Setting only the device changes only the current device and not the stream { at::cuda::CUDAGuard guard(/*device=*/1); - REQUIRE(guard.last_device() == 1); - REQUIRE(at::cuda::current_device() == 1); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(guard.last_device() == 1); + CATCH_REQUIRE(at::cuda::current_device() == 1); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); } - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
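
The CUDAGuard assertions in this test all check one RAII contract; a condensed sketch of that contract, using the same API the test exercises (illustrative only, not part of the patch):

#include "ATen/ATen.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/CUDAGuard.h"

void cuda_guard_sketch() {
  const auto original_device = at::cuda::current_device();
  const auto s = at::cuda::createCUDAStream();  // fresh stream on the current device
  {
    at::cuda::CUDAGuard guard(s);  // makes s current, switching device if s lives on another GPU
    // ... launch work on s ...
  }
  // On destruction the guard restores the previous device and the previous
  // current stream on every device it touched.
  AT_ASSERT(at::cuda::current_device() == original_device);
}
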
@@ -171,12 +171,12 @@ TEST_CASE("CUDAGuard") { guard.set_device(1); } - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); } -TEST_CASE("CUDAGuardIsMovable") { +CATCH_TEST_CASE("CUDAGuardIsMovable") { if (at::cuda::getNumGPUs() < 2) { return; } @@ -185,17 +185,17 @@ TEST_CASE("CUDAGuardIsMovable") { at::cuda::CUDAGuard first(stream); first.set_device(1); at::cuda::CUDAGuard second(std::move(first)); - REQUIRE(second.original_streams().size() == device_count); - REQUIRE(second.original_device() == 0); - REQUIRE(second.last_device() == 1); + CATCH_REQUIRE(second.original_streams().size() == device_count); + CATCH_REQUIRE(second.original_device() == 0); + CATCH_REQUIRE(second.last_device() == 1); at::cuda::CUDAGuard third; third = std::move(second); - REQUIRE(third.original_streams().size() == device_count); - REQUIRE(third.original_device() == 0); - REQUIRE(third.last_device() == 1); + CATCH_REQUIRE(third.original_streams().size() == device_count); + CATCH_REQUIRE(third.original_device() == 0); + CATCH_REQUIRE(third.last_device() == 1); } -TEST_CASE("Streampool Round Robin") { +CATCH_TEST_CASE("Streampool Round Robin") { std::vector streams{}; for (int i = 0; i < 200; ++i) { streams.emplace_back(at::cuda::detail::CUDAStream_createStream()); @@ -209,10 +209,10 @@ TEST_CASE("Streampool Round Robin") { if (!result_pair.second) hasDuplicates = true; } - REQUIRE(hasDuplicates); + CATCH_REQUIRE(hasDuplicates); } -TEST_CASE("Multi-GPU") { +CATCH_TEST_CASE("Multi-GPU") { if (at::cuda::getNumGPUs() < 2) return; at::cuda::CUDAStream s0 = at::cuda::createCUDAStream(true, 0); @@ -221,17 +221,17 @@ TEST_CASE("Multi-GPU") { at::cuda::setCurrentCUDAStream(s0); at::cuda::setCurrentCUDAStream(s1); - REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); + CATCH_REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); at::DeviceGuard device_guard{1}; - REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); + CATCH_REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); } -TEST_CASE("CUDAEvent Syncs") { +CATCH_TEST_CASE("CUDAEvent Syncs") { const auto stream = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event; - REQUIRE(!event.happened()); + CATCH_REQUIRE(!event.happened()); event.recordOnce(stream); @@ -242,10 +242,10 @@ TEST_CASE("CUDAEvent Syncs") { wait_stream1.synchronize_with(event); cudaStreamSynchronize(wait_stream0); - REQUIRE(event.happened()); + CATCH_REQUIRE(event.happened()); } -TEST_CASE("Cross-Device Events") { +CATCH_TEST_CASE("Cross-Device Events") { if (at::cuda::getNumGPUs() < 2) return; const auto stream0 = at::cuda::createCUDAStream(); @@ -260,10 +260,10 @@ TEST_CASE("Cross-Device Events") { event0 = std::move(event1); - REQUIRE(event0.device() == 1); + CATCH_REQUIRE(event0.device() == 1); stream0.synchronize_with(event0); cudaStreamSynchronize(stream0); - REQUIRE(event0.happened()); + CATCH_REQUIRE(event0.happened()); } diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 552328029ce03c..81701733b53693 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,7 +11,7 @@ using namespace at; 
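
The CUDAEvent tests above reduce to a producer/consumer pattern; a minimal sketch using the same calls as the test (illustrative only; CUDAEvent is assumed to be available through the ATen CUDA headers that stream_test.cpp already includes):

#include "ATen/cuda/CUDAContext.h"

void event_sync_sketch() {
  const auto producer = at::cuda::createCUDAStream();
  const auto consumer = at::cuda::createCUDAStream();

  at::cuda::CUDAEvent event;
  event.recordOnce(producer);        // mark a point in the producer stream
  consumer.synchronize_with(event);  // consumer will not proceed past that point early

  cudaStreamSynchronize(consumer);   // CUDAStream converts to the underlying cudaStream_t
  AT_ASSERT(event.happened());
}
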
-TEST_CASE( "parallel", "[cpu]" ) { +CATCH_TEST_CASE( "parallel", "[cpu]" ) { manual_seed(123, at::kCPU); set_num_threads(1); @@ -24,5 +24,5 @@ TEST_CASE( "parallel", "[cpu]" ) { as[0] = 1; as[1] = 0; as[2] = 0; - REQUIRE(a.sum(0).equal(as)); + CATCH_REQUIRE(a.sum(0).equal(as)); } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index 1b605e7271c6c7..c01dff2d0038b1 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -1,14 +1,14 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include #include "test_seed.h" using namespace at; -TEST_CASE( "undefined tensor test", "[]" ) { +CATCH_TEST_CASE( "undefined tensor test", "[]" ) { manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. @@ -17,36 +17,36 @@ TEST_CASE( "undefined tensor test", "[]" ) { std::stringstream ss; ss << und << std::endl; - REQUIRE(!und.defined()); - REQUIRE(std::string("UndefinedType") == und.toString()); - - REQUIRE_THROWS(und.strides()); - REQUIRE_THROWS(und.dim()); - REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); - REQUIRE_THROWS(und.add(und)); - REQUIRE_THROWS(und.add(ft)); - REQUIRE_THROWS(ft.add(und)); - REQUIRE_THROWS(und.add(5)); - REQUIRE_THROWS(und.mm(und)); + CATCH_REQUIRE(!und.defined()); + CATCH_REQUIRE(std::string("UndefinedType") == und.toString()); + + _CATCH_REQUIRE_THROWS(und.strides()); + _CATCH_REQUIRE_THROWS(und.dim()); + _CATCH_REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); + _CATCH_REQUIRE_THROWS(und.add(und)); + _CATCH_REQUIRE_THROWS(und.add(ft)); + _CATCH_REQUIRE_THROWS(ft.add(und)); + _CATCH_REQUIRE_THROWS(und.add(5)); + _CATCH_REQUIRE_THROWS(und.mm(und)); und.toType(und.type()); - REQUIRE_THROWS(und.toType(ft.type())); - REQUIRE_THROWS(ft.toType(und.type())); + _CATCH_REQUIRE_THROWS(und.toType(ft.type())); + _CATCH_REQUIRE_THROWS(ft.toType(und.type())); und.toType(ScalarType::Undefined); - REQUIRE_THROWS(und.toType(ScalarType::Float)); - REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); + _CATCH_REQUIRE_THROWS(und.toType(ScalarType::Float)); + _CATCH_REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); // copy_ - REQUIRE_THROWS(und.copy_(und)); - REQUIRE_THROWS(und.copy_(ft)); - REQUIRE_THROWS(ft.copy_(und)); + _CATCH_REQUIRE_THROWS(und.copy_(und)); + _CATCH_REQUIRE_THROWS(und.copy_(ft)); + _CATCH_REQUIRE_THROWS(ft.copy_(und)); und.toBackend(Backend::Undefined); - REQUIRE_THROWS(und.toBackend(Backend::CPU)); - REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); + _CATCH_REQUIRE_THROWS(und.toBackend(Backend::CPU)); + _CATCH_REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); - REQUIRE(!to_move.defined()); - REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensor::singleton()); + CATCH_REQUIRE(!to_move.defined()); + CATCH_REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); } diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp index 167520beb58382..42c9f61b19b5e1 100644 --- a/aten/src/ATen/test/weakref_test.cpp +++ b/aten/src/ATen/test/weakref_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" @@ -10,53 +10,53 @@ using at::Tensor; using at::WeakTensor; -TEST_CASE( "Weak pointer 
tests", "" ) { - SECTION("gets invalidated") { +CATCH_TEST_CASE( "Weak pointer tests", "" ) { + CATCH_SECTION("gets invalidated") { Tensor a = at::ones({2, 2}); WeakTensor b = a; a.reset(); - REQUIRE_FALSE(b.lock().defined()); + CATCH_REQUIRE_FALSE(b.lock().defined()); } - SECTION("can successfully lock") { + CATCH_SECTION("can successfully lock") { Tensor a = at::ones({2, 2}); WeakTensor b = a; auto c = b.lock(); - REQUIRE(c.defined()); + CATCH_REQUIRE(c.defined()); a.reset(); - REQUIRE(b.lock().defined()); + CATCH_REQUIRE(b.lock().defined()); c.reset(); - REQUIRE_FALSE(b.lock().defined()); + CATCH_REQUIRE_FALSE(b.lock().defined()); } - SECTION("updates refcounts correctly") { + CATCH_SECTION("updates refcounts correctly") { Tensor a = at::ones({2, 2}); - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 2); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 2); } - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); auto locked = b.lock(); - REQUIRE(locked.defined()); - REQUIRE(a.use_count() == 2); + CATCH_REQUIRE(locked.defined()); + CATCH_REQUIRE(a.use_count() == 2); } - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 2); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 2); a.reset(); - REQUIRE(b.use_count() == 0); - REQUIRE(b.weak_use_count() == 1); + CATCH_REQUIRE(b.use_count() == 0); + CATCH_REQUIRE(b.weak_use_count() == 1); } } } diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index 8e813bc7f7deeb..f76dac212a0921 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -1,43 +1,43 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -TEST_CASE( "wrapdim test", "[]" ) { +CATCH_TEST_CASE( "wrapdim test", "[]" ) { manual_seed(123, at::kCPU); Type & T = CPU(kFloat); - SECTION( "simple case" ) { + CATCH_SECTION( "simple case" ) { auto a = randn({2, 3, 4, 5}, T); - REQUIRE(a.prod(-4).equal(a.prod(0))); - REQUIRE(a.prod(3).equal(a.prod(-1))); + CATCH_REQUIRE(a.prod(-4).equal(a.prod(0))); + CATCH_REQUIRE(a.prod(3).equal(a.prod(-1))); } - SECTION( "expression specification" ) { + CATCH_SECTION( "expression specification" ) { auto a = randn({2, 3, 4, 5}, T); - REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); - REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + CATCH_REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + CATCH_REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); // can unsqueeze scalar auto b = randn(1, T); b.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); + CATCH_REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); } - SECTION( "empty tensor" ) { + CATCH_SECTION( "empty tensor" ) { auto a = randn(0, T); - REQUIRE(a.prod(0).equal(at::ones({}, T))); + CATCH_REQUIRE(a.prod(0).equal(at::ones({}, T))); } - SECTION( "scalar vs 1-dim, 1-size" ) { + CATCH_SECTION( "scalar vs 1-dim, 1-size" ) { auto a = randn(1, 
T); - REQUIRE(a.prod(0).equal(a.prod(-1))); + CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); a.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(a.dim() == 0); - REQUIRE(a.prod(0).equal(a.prod(-1))); + CATCH_REQUIRE(a.dim() == 0); + CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); } } diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index 9fe22beb0dc54e..10d43e1433c811 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -26,7 +26,6 @@ SET(hdr set(ATen_TH_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THSize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THStorageFunctions.cpp diff --git a/aten/src/TH/THDiskFile.cpp b/aten/src/TH/THDiskFile.cpp index a806df6ac62ebf..ddf993df6605d7 100644 --- a/aten/src/TH/THDiskFile.cpp +++ b/aten/src/TH/THDiskFile.cpp @@ -359,9 +359,9 @@ READ_WRITE_METHODS(float, Float, int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++, int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++) -READ_WRITE_METHODS(THHalf, Half, - float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; }, - int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++) +READ_WRITE_METHODS(at::Half, Half, + float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= static_cast(buf); nread++; }, + int ret = fprintf(dfself->handle, "%.9g", static_cast(data[i])); if(ret <= 0) break; else nwrite++) READ_WRITE_METHODS(double, Double, int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++, diff --git a/aten/src/TH/THGenerateHalfType.h b/aten/src/TH/THGenerateHalfType.h index 09d4c878d0f11e..8e1b5eaed4d946 100644 --- a/aten/src/TH/THGenerateHalfType.h +++ b/aten/src/TH/THGenerateHalfType.h @@ -5,8 +5,8 @@ #include "THHalf.h" #define scalar_t THHalf #define accreal float -#define TH_CONVERT_REAL_TO_ACCREAL(_val) TH_half2float(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) TH_float2half(_val) +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (scalar_t)(_val) #define Real Half #define THInf TH_HALF_BITS_TO_LITERAL(TH_HALF_INF) #define TH_REAL_IS_HALF diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp deleted file mode 100644 index c95272c5f13e34..00000000000000 --- a/aten/src/TH/THHalf.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include "THHalf.h" -#include - -/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 
*/ - -THHalf TH_float2half(float f) -{ - THHalf h; - TH_float2halfbits(&f, &h.x); - return h; -} - -TH_API float TH_half2float(THHalf h) -{ - float f; - TH_halfbits2float(&h.x, &f); - return f; -} - - -void TH_halfbits2float(unsigned short* src, float* res) -{ - *res = at::detail::halfbits2float(*src); -} - - -void TH_float2halfbits(float* src, unsigned short* dest) -{ - *dest = at::detail::float2halfbits(*src); -} diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index fb68639ec44752..f7c884f2cc67bd 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -1,10 +1,8 @@ #ifndef TH_HALF_H #define TH_HALF_H -#include - #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus @@ -14,10 +12,4 @@ typedef struct at_Half at_Half; #define THHalf at_Half #endif -TH_API void TH_float2halfbits(float*, unsigned short*); -TH_API void TH_halfbits2float(unsigned short*, float*); - -TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - #endif diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index 3f2187b68f74ea..b74d7926ebff21 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -343,11 +343,11 @@ READ_WRITE_METHODS(float, Float, nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", data[i]), 1) -READ_WRITE_METHODS(THHalf, Half, +READ_WRITE_METHODS(at::Half, Half, int nByteRead_; float buf; \ int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ - data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", TH_half2float(data[i])), + data[i] = static_cast(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", static_cast(data[i])), 1) READ_WRITE_METHODS(double, Double, diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index a5319e67dabe61..c4d1f778250ea2 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -15,9 +15,9 @@ #include "generic/THStorageCopy.cpp" #include "THGenerateHalfType.h" -THStorage* THStorage_new(at::ScalarType scalar_type) { +THStorage* THStorage_new(caffe2::TypeMeta data_type) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, 0, getTHDefaultAllocator(), true).release(); diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 93a89f1753f000..95e5bacc2cef55 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -30,7 +30,7 @@ // If it is not, you must report that the storage is dead. 
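
The Half-related hunks above all rely on one idiom: TH_float2half/TH_half2float are deleted, and at::Half converts to and from float through ordinary casts, exactly as the static_cast calls in THDiskFile.cpp and THMemoryFile.cpp show. A minimal sketch (the header path is an assumption based on the include this patch switches THHalf.h to):

#include <ATen/core/Half.h>  // assumed location of at::Half in this revision

float half_roundtrip_sketch(float f) {
  at::Half h = static_cast<at::Half>(f);  // replaces TH_float2half(f)
  return static_cast<float>(h);           // replaces TH_half2float(h)
}
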
// -TH_CPP_API THStorage* THStorage_new(at::ScalarType scalar_type); +TH_CPP_API THStorage* THStorage_new(caffe2::TypeMeta data_type); TH_API ptrdiff_t THStorage_size(const THStorage *self); TH_API void THStorage_retain(THStorage *storage); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 0c731779b95685..887b0dcf2dff97 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -8,7 +8,7 @@ #include -// NB: This is NOT valid on UndefinedTensor +// NB: This is NOT valid on UndefinedTensorImpl void THTensor_free(THTensor *self) { if (!self) return; @@ -39,14 +39,14 @@ void THTensor_setStorageNd(THTensor *self, THStorage *storage, ptrdiff_t storage if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype()); + auto data_type = THTensor_getStoragePtr(self)->dtype(); if(storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); } else { - THTensor_stealAndSetStoragePtr(self, THStorage_new(scalar_type)); + THTensor_stealAndSetStoragePtr(self, THStorage_new(data_type)); } } @@ -123,7 +123,7 @@ void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, cons if(totalSize+self->storage_offset() > 0) { if(!THTensor_getStoragePtr(self)) { - THTensor_stealAndSetStoragePtr(self, THStorage_new(self->scalar_type())); + THTensor_stealAndSetStoragePtr(self, THStorage_new(self->dtype())); } if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THStorage_resize(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); @@ -202,5 +202,5 @@ void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { // Caffe2 might have tensors whose storages are null, but we // don't allow it in PyTorch. 
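
The storage hunks above change the key used to create a THStorage from an at::ScalarType to a caffe2::TypeMeta. A hedged sketch of a call site before and after (function and type names are taken from the diff; the wrapper function itself is hypothetical):

#include "THStorageFunctions.hpp"  // declares THStorage_new(caffe2::TypeMeta)

THStorage* new_float_storage_sketch() {
  // before this patch: THStorage_new(at::ScalarType::Float)
  // element size now comes from the TypeMeta itself, e.g.
  // caffe2::TypeMeta::Make<float>().itemsize() == sizeof(float)
  return THStorage_new(caffe2::TypeMeta::Make<float>());
}
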
AT_ASSERT(storage); - tensor->storage_ = at::Storage(storage); + tensor->storage_ = at::Storage(c10::intrusive_ptr::reclaim(storage)); } diff --git a/aten/src/TH/THTensorCopy.cpp b/aten/src/TH/THTensorCopy.cpp index d8df519e26bdbc..482a7b986f5302 100644 --- a/aten/src/TH/THTensorCopy.cpp +++ b/aten/src/TH/THTensorCopy.cpp @@ -1,6 +1,8 @@ #include "THTensor.hpp" #include "THVector.h" +#include + #include "generic/THTensorCopy.cpp" #include "THGenerateAllTypes.h" diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 7ed962567a31ff..6dfd90cfbe1bd2 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,13 +21,13 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType::to()); + return THStorage_new(caffe2::TypeMeta::Make()); } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, getTHDefaultAllocator(), true).release(); @@ -38,7 +38,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, allocator, true).release(); @@ -48,18 +48,18 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - auto scalar_type = at::CTypeToScalarType::to(); + auto type_meta = caffe2::TypeMeta::Make(); size_t actual_size = -1; THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + type_meta, size, THMapAllocator::makeDataPtr( - filename, flags, size * at::elementSize(scalar_type), &actual_size), + filename, flags, size * type_meta.itemsize(), &actual_size), /* allocator */ nullptr, false).release(); if (size <= 0) { - storage->set_numel(actual_size / at::elementSize(scalar_type)); + storage->set_numel(actual_size / type_meta.itemsize()); } return storage; @@ -116,7 +116,7 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, std::move(data), allocator, diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 1de588bbd2d75b..ea8a0d5808cb16 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -29,40 +29,6 @@ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage data[i] = static_cast(src_data[i]); \ } -#define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = (scalar_t)TH_half2float(src_data[i]); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = 
TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = TH_float2half((float)(src_data[i])); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = static_cast(src_data[i]); \ -} - -#ifndef TH_REAL_IS_HALF IMPLEMENT_THStorage_COPY(Byte) IMPLEMENT_THStorage_COPY(Char) IMPLEMENT_THStorage_COPY(Short) @@ -70,18 +36,6 @@ IMPLEMENT_THStorage_COPY(Int) IMPLEMENT_THStorage_COPY(Long) IMPLEMENT_THStorage_COPY(Float) IMPLEMENT_THStorage_COPY(Double) -IMPLEMENT_THStorage_COPY_FROM_HALF(Half) -#else -/* only allow pass-through for Half */ -IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half) -IMPLEMENT_THStorage_COPY_TO_HALF(Byte) -IMPLEMENT_THStorage_COPY_TO_HALF(Char) -IMPLEMENT_THStorage_COPY_TO_HALF(Short) -IMPLEMENT_THStorage_COPY_TO_HALF(Int) -IMPLEMENT_THStorage_COPY_TO_HALF(Long) -IMPLEMENT_THStorage_COPY_TO_HALF(Float) -IMPLEMENT_THStorage_COPY_TO_HALF(Double) -#endif - +IMPLEMENT_THStorage_COPY(Half) #endif diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 3f373ee2119c26..9a7bd0be736f25 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -54,13 +54,21 @@ scalar_t *THTensor_(data)(const THTensor *self) { /* Empty init */ THTensor *THTensor_(new)(void) { - return c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + return c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); } /* Pointer-copy init */ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); THTensor_(setStorageNd)(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), @@ -75,7 +83,11 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -547,7 +559,7 @@ ptrdiff_t THTensor_(nElement)(const THTensor *self) } } -// NB: It is INVALID to call this on an UndefinedTensor +// NB: It is INVALID to call this on an UndefinedTensorImpl void THTensor_(retain)(THTensor *self) { c10::raw::intrusive_ptr::incref(self); diff --git a/aten/src/TH/generic/THTensor.h b/aten/src/TH/generic/THTensor.h index d5316919d4e31e..27807ea57a7dd0 100644 --- a/aten/src/TH/generic/THTensor.h +++ b/aten/src/TH/generic/THTensor.h @@ -5,7 +5,7 @@ /* a la lua? dim, storageoffset, ... et les methodes ? 
*/ #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus diff --git a/aten/src/TH/generic/THTensorCopy.cpp b/aten/src/TH/generic/THTensorCopy.cpp index a9e0564fb574c8..91a60275a6c03e 100644 --- a/aten/src/TH/generic/THTensorCopy.cpp +++ b/aten/src/TH/generic/THTensorCopy.cpp @@ -26,13 +26,11 @@ int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) { // special case copy where tensor is contiguous and src is a transposed matrix // This can be generalized to most copies, but it's tricker void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { - #define MIN(x, y) (((x) < (y)) ? (x) : (y)) - #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #ifdef TH_REAL_IS_BYTE - const int BLOCK_SZ = 120; + const int64_t BLOCK_SZ = 120; #else - const int BLOCK_SZ = 60; + const int64_t BLOCK_SZ = 60; #endif THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ); @@ -48,8 +46,8 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { scalar_t *spo = sp + R + C * NR; scalar_t *rpo = rp + C + R * NC; - int nr = MIN(NR - R, BLOCK_SZ); - int nc = MIN(NC - C, BLOCK_SZ); + int nr = std::min(NR - R, BLOCK_SZ); + int nc = std::min(NC - C, BLOCK_SZ); // 1. copy columns from src to buf for (int c = 0; c < nc; c++) { @@ -57,10 +55,10 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { } // 2. transpose buf in place - int rc_max = MAX(nr, nc); - int rc_min = MIN(nr, nc); + int rc_max = std::max(nr, nc); + int rc_min = std::min(nr, nc); for (int r = 0; r < rc_max; r++) { - int end = MIN(r, rc_min); + int end = std::min(r, rc_min); for (int c = 0; c < end; c++) { scalar_t tmp = bp[r + BLOCK_SZ * c]; bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; @@ -75,8 +73,6 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { } } c10::raw::intrusive_ptr::decref(buf); - #undef MIN - #undef MAX } void THTensor_(copy)(THTensor *tensor, THTensor *src) @@ -203,28 +199,6 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src static_cast>(*src_data));) \ } -#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \ -} - -#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, \ - *tensor_data = static_cast( \ - static_cast>( \ - TH_half2float(*src_data)));) \ -} - -#define IMPLEMENT_THTensor_COPY_TO_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, *tensor_data = *src_data;) \ -} - -#ifndef TH_REAL_IS_HALF IMPLEMENT_THTensor_COPY(Byte, uint8_t) IMPLEMENT_THTensor_COPY(Char, int8_t) IMPLEMENT_THTensor_COPY(Short, int16_t) @@ -232,18 +206,6 @@ IMPLEMENT_THTensor_COPY(Int, int32_t) IMPLEMENT_THTensor_COPY(Long, int64_t) IMPLEMENT_THTensor_COPY(Float, float) IMPLEMENT_THTensor_COPY(Double, double) -IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) -#else -/* only allow pass-through for Half */ -IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf) -IMPLEMENT_THTensor_COPY_TO_HALF(Byte, uint8_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Char, int8_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Short, int16_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Int, int32_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Long, int64_t) 
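For context on the copyTranspose hunk above: the routine copies a transposed source into a contiguous destination in BLOCK_SZ-by-BLOCK_SZ tiles so that both the strided reads and the contiguous writes stay cache-friendly; replacing the MIN/MAX macros with std::min/std::max and widening BLOCK_SZ to int64_t does not change that algorithm. Below is a minimal standalone sketch of the same tiling idea; it writes each tile directly and omits the intermediate buffer (and the in-place buffer transpose) that the real code uses.

#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified sketch of the tiling in THTensor_(copyTranspose): dst is a
// contiguous NR x NC matrix, src holds the transposed data (NC x NR, also
// contiguous). Working in BLOCK_SZ x BLOCK_SZ tiles keeps both access
// patterns cache-friendly.
void copy_transpose(float* dst, const float* src, int64_t NR, int64_t NC) {
  const int64_t BLOCK_SZ = 60;  // same block size the non-byte path uses
  for (int64_t R = 0; R < NR; R += BLOCK_SZ) {
    for (int64_t C = 0; C < NC; C += BLOCK_SZ) {
      int64_t nr = std::min(NR - R, BLOCK_SZ);
      int64_t nc = std::min(NC - C, BLOCK_SZ);
      for (int64_t r = 0; r < nr; ++r)
        for (int64_t c = 0; c < nc; ++c)
          // dst is row-major NR x NC; src is its transpose, row-major NC x NR.
          dst[(R + r) * NC + (C + c)] = src[(C + c) * NR + (R + r)];
    }
  }
}

int main() {
  const int64_t NR = 130, NC = 70;
  std::vector<float> src(NR * NC), dst(NR * NC);
  for (int64_t i = 0; i < NC; ++i)
    for (int64_t j = 0; j < NR; ++j)
      src[i * NR + j] = static_cast<float>(i * NR + j);  // fill the NC x NR source
  copy_transpose(dst.data(), src.data(), NR, NC);
  return dst[3 * NC + 5] == src[5 * NR + 3] ? 0 : 1;  // spot-check one element
}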
-IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) -IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) - -#endif /* REAL_IS_HALF */ +IMPLEMENT_THTensor_COPY(Half, at::Half) #endif diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 44f5d188d5b42b..4b8ab9f4ab101a 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -92,7 +92,6 @@ INSTALL(FILES THCGenerateFloatType.h THCGenerateFloatTypes.h THCGenerateDoubleType.h - THCHalf.h THCIntegerDivider.cuh THCNumerics.cuh THCTensorSort.cuh diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 8fec96dde6f647..756fa0f905ac13 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -2,7 +2,7 @@ #define THC_ATOMICS_INC #include "THC.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" #include "ATen/ATen.h" @@ -95,7 +95,7 @@ static inline __device__ void atomicAdd(int64_t *address, int64_t val) { AtomicAddIntegerImpl()(address, val); } -static inline __device__ void atomicAdd(half *address, half val) { +static inline __device__ void atomicAdd(at::Half *address, at::Half val) { unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); unsigned int old = *address_as_ui; @@ -103,23 +103,13 @@ static inline __device__ void atomicAdd(half *address, half val) { do { assumed = old; -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - half hsum; + at::Half hsum; hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics::add(hsum, val); -#else - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = THCNumerics::add(hsum, val); - hsum = __half_raw(tmpres); -#endif + hsum = THCNumerics::add(hsum, val); old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; old = atomicCAS(address_as_ui, assumed, old); } while (assumed != old); } -static inline __device__ void atomicAdd(at::Half *address, at::Half val) { - atomicAdd(reinterpret_cast(address), val); -} #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) // from CUDA C Programmic Guide diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 20b13d82b9a152..6375ced8c691cc 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -1,6 +1,6 @@ #include "THCBlas.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include @@ -50,7 +50,7 @@ double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, doub return 0; } -half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy) +at::Half THCudaBlas_Hdot(THCState *state, int64_t n, at::Half *x, int64_t incx, at::Half *y, int64_t incy) { #if CUDA_VERSION >= 8000 if (n == 1) { @@ -59,7 +59,7 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, } if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { - half result; + at::Half result; cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasDotEx(handle, n, @@ -72,10 +72,10 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, THError("Cublas_Hdot only supports n, incx and incy " "up to signed integer limits: %d", INT_MAX); - return THC_float2half(0); + return 0.0; #else THError("Cublas_Hdot requires CUDA 8.0+"); - return THC_float2half(0); + return 0.0; #endif } @@ -267,7 +267,7 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 # define CUDA_R_16F CUBLAS_DATA_HALF #endif -void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc) +void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) { adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); @@ -293,8 +293,8 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 #else // Simulated Hgemm - float fAlpha = THC_half2float(alpha); - float fBeta = THC_half2float(beta); + float fAlpha = alpha; + float fBeta = beta; #if CUDA_VERSION < 9000 THCublasCheck(cublasSgemmEx(handle, opa, opb, @@ -355,8 +355,8 @@ void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int6 #if CUDA_VERSION >= 9010 void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, - half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, - half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount) + at::Half alpha, const at::Half *a, int64_t lda, int64_t strideA, const at::Half *b, int64_t ldb, int64_t strideB, + at::Half beta, at::Half *c, int64_t ldc, int64_t strideC, int64_t batchCount) { if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) @@ -371,8 +371,8 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i 
cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); - float fAlpha = THC_half2float(alpha); - float fBeta = THC_half2float(beta); + float fAlpha = alpha; + float fBeta = beta; THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); THCublasCheck(cublasGemmStridedBatchedEx(handle, opa, opb, (int)m, (int)n, (int)k, diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index 36a0b35294dff2..a73b47f162c3f3 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -2,12 +2,12 @@ #define THC_BLAS_INC #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" /* Level 1 */ THC_API float THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy); THC_API double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, double *y, int64_t incy); -THC_API half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy); +THC_API THHalf THCudaBlas_Hdot(THCState *state, int64_t n, THHalf *x, int64_t incx, THHalf *y, int64_t incy); /* Level 2 */ THC_API void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t lda, float *x, int64_t incx, float beta, float *y, int64_t incy); @@ -19,7 +19,7 @@ THC_API void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); -THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc); +THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, @@ -38,8 +38,8 @@ THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char t #if CUDA_VERSION >= 9010 void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, - half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, - half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount); + THHalf alpha, const THHalf *a, int64_t lda, int64_t strideA, const THHalf *b, int64_t ldb, int64_t strideB, + THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); #endif /* Inverse */ diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index 6bc7da7cee6e2d..05da61bd56e754 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -125,7 +125,7 @@ void THCudaShutdown(THCState* state) for (int dev = 0; dev < deviceCount; ++dev) { THCudaCheck(cudaSetDevice(dev)); THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]); - + // Frees BLAS handle if (res->blasHandle) { THCublasCheck(cublasDestroy(res->blasHandle)); @@ -256,7 +256,7 @@ cublasHandle_t 
THCState_getCurrentBlasHandle(THCState *state) THError("THCState and sparseHandles must be set as there is no default sparseHandle"); return NULL; } - + int device; THCudaCheck(cudaGetDevice(&device)); @@ -280,7 +280,7 @@ cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state) int device; THCudaCheck(cudaGetDevice(&device)); - + // Creates the sparse handle if not created yet THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); if (!res->sparseHandle) { @@ -474,30 +474,3 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB #include "THCStorage.cpp" #include "THCAllocator.cpp" - -/* from THCHalf.h */ - -half THC_float2half(float f) -{ -#if CUDA_VERSION < 9000 - half h; - TH_float2halfbits(&f, &h.x); - return h; -#else - __half_raw h_raw; - TH_float2halfbits(&f, &h_raw.x); - return half(h_raw); -#endif -} - -float THC_half2float(half h) -{ - float f; -#if CUDA_VERSION < 9000 - TH_halfbits2float(&h.x, &f); -#else - __half_raw h_raw(h); - TH_halfbits2float(&h_raw.x, &f); -#endif - return f; -} diff --git a/aten/src/THC/THCGenerateHalfType.h b/aten/src/THC/THCGenerateHalfType.h index 54358a9be1a59d..596ea47904820f 100644 --- a/aten/src/THC/THCGenerateHalfType.h +++ b/aten/src/THC/THCGenerateHalfType.h @@ -2,9 +2,9 @@ #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" #endif -#include "THCHalf.h" +#include "TH/THHalf.h" -#define scalar_t half +#define scalar_t THHalf #define accreal float #define Real Half diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h deleted file mode 100644 index aeae06fc4739ba..00000000000000 --- a/aten/src/THC/THCHalf.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef THC_HALF_CONVERSION_INC -#define THC_HALF_CONVERSION_INC - -#include "THCGeneral.h" - -#include -#include - -#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -#ifndef __cplusplus -typedef __half_raw half; -#endif -#endif - -THC_API half THC_float2half(float a); -THC_API float THC_half2float(half a); - -#endif diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 8368141d474ec4..ed6f44c2114726 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -4,7 +4,7 @@ #include #include #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -165,214 +165,81 @@ struct THCNumerics { // DEPRECATED: use math functions from std and NumericLimits.cuh template <> -struct THCNumerics { - static inline __host__ __device__ half min() { return at::numeric_limits::lowest(); } - static inline __host__ __device__ half max() { return at::numeric_limits::max(); } - static inline __host__ __device__ half lower_bound() { return at::numeric_limits::lower_bound(); } - static inline __host__ __device__ half upper_bound() { return at::numeric_limits::upper_bound(); } - - static inline __host__ __device__ bool lt(half a, half b) { - return static_cast(a) < static_cast(b); - } - - static inline __host__ __device__ bool le(half a, half b) { - return static_cast(a) <= static_cast(b); - } - - static inline __host__ __device__ bool gt(half a, half b) { - return static_cast(a) > static_cast(b); - } - - static inline __host__ __device__ bool ge(half a, half b) { - return static_cast(a) >= static_cast(b); - } - - static inline __host__ __device__ bool eq(half a, half b) { - // has to be explicitly casted to float for now, otherwise get error: more than one operator "==" matches these operands - // Note: find the overloading for == 
and != (probably THCTensorTypeUtils.cuh) and resolve - return static_cast(static_cast(a)) == static_cast(static_cast(b)); - } - - static inline __host__ __device__ bool ne(half a, half b) { - // has to be explicitly casted to float for now, otherwise get error: more than one operator "==" matches these operands - // Note: find the overloading for == and != (probably THCTensorTypeUtils.cuh) and resolve - return static_cast(static_cast(a)) != static_cast(static_cast(b)); - } - - static inline __host__ __device__ half exp(half a) { - return static_cast(std::exp(static_cast(a))); - } - - // note that exp10 is not in the std namespace. - static inline __host__ __device__ half exp10(half a) { - return static_cast(::exp10(static_cast(a))); - } - - static inline __host__ __device__ half log(half a) { - return static_cast(::log(static_cast(a))); - } - - static inline __host__ __device__ half log10(half a) { - return static_cast(::log10(static_cast(a))); - } - - static inline __host__ __device__ half log1p(half a) { - return static_cast(::log1p(static_cast(a))); - } - - static inline __host__ __device__ half log2(half a) { - return static_cast(::log2(static_cast(a))); - } - - static inline __host__ __device__ half lgamma(half a) { - return static_cast(::lgamma(static_cast(a))); - } - - static inline __host__ __device__ half expm1(half a) { - return static_cast(::expm1(static_cast(a))); - } - - static inline __host__ __device__ half cos(half a) { - return static_cast(::cos(static_cast(a))); - } - - static inline __host__ __device__ half sin(half a) { - return static_cast(::sin(static_cast(a))); - } - - static inline __host__ __device__ half sqrt(half a) { - return static_cast(::sqrt(static_cast(a))); - } - - // note that rsqrt is not in the std namespace. - static inline __host__ __device__ half rsqrt(half a) { - return static_cast(::rsqrt(static_cast(a))); - } - - static inline __host__ __device__ half ceil(half a) { - return static_cast(::ceil(static_cast(a))); - } - - static inline __host__ __device__ half floor(half a) { - return static_cast(::floor(static_cast(a))); - } - - static inline __host__ __device__ half trunc(half a) { - return static_cast(::trunc(static_cast(a))); - } - - static inline __host__ __device__ half neg(half a) { - return static_cast(-(static_cast(a))); - } - - static inline __host__ __device__ half acos(half a) { - return static_cast(::acos(static_cast(a))); - } - - static inline __host__ __device__ half cosh(half a) { - return static_cast(::cosh(static_cast(a))); - } - - static inline __host__ __device__ half asin(half a) { - return static_cast(::asin(static_cast(a))); - } - - static inline __host__ __device__ half sinh(half a) { - return static_cast(::sinh(static_cast(a))); - } - - static inline __host__ __device__ half tan(half a) { - return static_cast(::tan(static_cast(a))); - } - - static inline __host__ __device__ half atan(half a) { - return static_cast(::atan(static_cast(a))); - } - - static inline __host__ __device__ half tanh(half a) { - return static_cast(::tanh(static_cast(a))); - } - - - static inline __host__ __device__ half erf(half a) { - return static_cast(::erf(static_cast(a))); - } - - - static inline __host__ __device__ half erfc(half a) { - return static_cast(::erfc(static_cast(a))); - } - - // note that erfinv is not in the std namespace. 
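The reason this block of cast-through-float boilerplate can be deleted (and the much shorter specialization that replaces it below compiles) is that at::Half converts implicitly to and from float, so expressions like std::exp(a) or a + b promote to float math and narrow back on return. A standalone sketch of that conversion pattern follows, using a hypothetical Half16 model type that simply stores a float for clarity rather than the real 16-bit at::Half.

#include <cmath>
#include <cstdio>

// Hypothetical 'Half16' standing in for at::Half. Only the two implicit
// conversions matter for this sketch; a real half would pack IEEE fp16 bits.
struct Half16 {
  float v;
  Half16(float f = 0.f) : v(f) {}       // implicit construction from float
  operator float() const { return v; }  // implicit conversion to float
};

// With those conversions in place, no per-type special cases are needed:
Half16 half_sin(Half16 a) { return std::sin(a); }      // Half16 -> float -> sin -> Half16
Half16 half_add(Half16 a, Half16 b) { return a + b; }  // both operands promote to float

int main() {
  Half16 x(0.5f);
  std::printf("sin(0.5)   ~= %f\n", static_cast<float>(half_sin(x)));
  std::printf("0.5 + 0.25  = %f\n", static_cast<float>(half_add(x, Half16(0.25f))));
  return 0;
}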
- static inline __host__ __device__ half erfinv(half a) { - return static_cast(::erfinv(static_cast(a))); - } - - static inline __host__ __device__ half abs(half a) { -#ifdef __HIP_PLATFORM_HCC__ - return static_cast(std::abs(static_cast(a))); -#else - return static_cast(::abs(static_cast(a))); -#endif - } - - static inline __host__ __device__ half round(half a) { - return static_cast(::round(static_cast(a))); - } - - static inline __host__ __device__ half frac(half a) { +struct THCNumerics { + static inline __host__ __device__ at::Half min() { return at::numeric_limits::lowest(); } + static inline __host__ __device__ at::Half max() { return at::numeric_limits::max(); } + static inline __host__ __device__ at::Half lower_bound() { return at::numeric_limits::lower_bound(); } + static inline __host__ __device__ at::Half upper_bound() { return at::numeric_limits::upper_bound(); } + + static inline __host__ __device__ bool lt(at::Half a, at::Half b) { return a < b; } + static inline __host__ __device__ bool le(at::Half a, at::Half b) { return a <= b; } + static inline __host__ __device__ bool gt(at::Half a, at::Half b) { return a > b; } + static inline __host__ __device__ bool ge(at::Half a, at::Half b) { return a >= b; } + static inline __host__ __device__ bool eq(at::Half a, at::Half b) { return a == b; } + static inline __host__ __device__ bool ne(at::Half a, at::Half b) { return a != b; } + + static inline __host__ __device__ at::Half exp(at::Half a) { return std::exp(a); } + static inline __host__ __device__ at::Half exp10(at::Half a) { return ::exp10(a); } + static inline __host__ __device__ at::Half log(at::Half a) { return ::log(a); } + static inline __host__ __device__ at::Half log10(at::Half a) { return ::log10(a); } + static inline __host__ __device__ at::Half log1p(at::Half a) { return ::log1p(a); } + static inline __host__ __device__ at::Half log2(at::Half a) { return ::log2(a); } + static inline __host__ __device__ at::Half lgamma(at::Half a) { return ::lgamma(a); } + static inline __host__ __device__ at::Half expm1(at::Half a) { return ::expm1(a); } + static inline __host__ __device__ at::Half cos(at::Half a) { return ::cos(a); } + static inline __host__ __device__ at::Half sin(at::Half a) { return ::sin(a); } + static inline __host__ __device__ at::Half sqrt(at::Half a) { return ::sqrt(a); } + static inline __host__ __device__ at::Half rsqrt(at::Half a) { return ::rsqrt(a); } + static inline __host__ __device__ at::Half ceil(at::Half a) { return ::ceil(a); } + static inline __host__ __device__ at::Half floor(at::Half a) { return ::floor(a); } + static inline __host__ __device__ at::Half trunc(at::Half a) { return ::trunc(a); } + static inline __host__ __device__ at::Half neg(at::Half a) { return -a; } + static inline __host__ __device__ at::Half acos(at::Half a) { return ::acos(a); } + static inline __host__ __device__ at::Half cosh(at::Half a) { return ::cosh(a); } + static inline __host__ __device__ at::Half asin(at::Half a) { return ::asin(a); } + static inline __host__ __device__ at::Half sinh(at::Half a) { return ::sinh(a); } + static inline __host__ __device__ at::Half tan(at::Half a) { return ::tan(a); } + static inline __host__ __device__ at::Half atan(at::Half a) { return ::atan(a); } + static inline __host__ __device__ at::Half tanh(at::Half a) { return ::tanh(a); } + static inline __host__ __device__ at::Half erf(at::Half a) { return ::erf(a); } + static inline __host__ __device__ at::Half erfc(at::Half a) { return ::erfc(a); } + static inline __host__ __device__ 
at::Half erfinv(at::Half a) { return ::erfinv(a); } + static inline __host__ __device__ at::Half abs(at::Half a) { return std::abs(a); } + static inline __host__ __device__ at::Half round(at::Half a) { return ::round(a); } + + static inline __host__ __device__ at::Half frac(at::Half a) { #ifdef __CUDA_ARCH__ - return static_cast(a) - static_cast(::trunc(static_cast(a))); + return a - ::trunc(a); #else // __CUDA_ARCH__ - return static_cast(a) - static_cast(::floor(static_cast(a))); + return a - ::floor(a); #endif } - static inline __host__ __device__ half cinv(half a) { - return static_cast(1.0f / static_cast(a)); - } - - static inline __host__ __device__ half add(half a, half b) { - return static_cast(a) + static_cast(b); - } - - static inline __host__ __device__ half div(half a, half b) { - return static_cast(a) / static_cast(b); - } - - static inline __host__ __device__ half mul(half a, half b) { - return static_cast(a) * static_cast(b); - } - - static inline __host__ __device__ half sub(half a, half b) { - return static_cast(a) - static_cast(b); - } + static inline __host__ __device__ at::Half cinv(at::Half a) { return 1.0f / a; } + static inline __host__ __device__ at::Half add(at::Half a, at::Half b) { return a + b; } + static inline __host__ __device__ at::Half div(at::Half a, at::Half b) { return a / b; } + static inline __host__ __device__ at::Half mul(at::Half a, at::Half b) { return a * b; } + static inline __host__ __device__ at::Half sub(at::Half a, at::Half b) { return a - b; } + static inline __host__ __device__ at::Half pow(at::Half a, at::Half b) { return ::pow(a, b); } + static inline __host__ __device__ at::Half atan2(at::Half a, at::Half b) { return ::atan2(a, b); } - static inline __host__ __device__ half pow(half a, half b) { - return static_cast(::pow(static_cast(a), static_cast(b))); - } - - static inline __host__ __device__ half atan2(half a, half b) { - return static_cast(::atan2(static_cast(a), static_cast(b))); - } - - static inline __host__ __device__ bool isnan(half a) { + static inline __host__ __device__ bool isnan(at::Half a) { #ifdef _MSC_VER // Windows requires this explicit conversion. The reason is unclear // related issue with clang: https://reviews.llvm.org/D37906 - return ::isnan((float)static_cast(a)); + return ::isnan((float) a); #else - return ::isnan(static_cast(a)); + return ::isnan(a); #endif } - static inline __host__ __device__ bool isinf(half a) { + static inline __host__ __device__ bool isinf(at::Half a) { #ifdef _MSC_VER // Windows requires this explicit conversion. 
The reason is unclear // related issue with clang: https://reviews.llvm.org/D37906 - return ::isinf((float)static_cast(a)); + return ::isinf((float) a); #else - return ::isinf(static_cast(a)); + return ::isinf(a); #endif } @@ -510,35 +377,6 @@ struct ScalarConvert { static __host__ __device__ Out to(const In v) { return (Out) v; } }; -template -struct ScalarConvert { - static __host__ __device__ Out to(const half v) { -#ifdef __CUDA_ARCH__ - return (Out) __half2float(v); -#else - return (Out) THC_half2float(v); -#endif - } -}; - -template -struct ScalarConvert { - static __host__ __device__ half to(const In v) { -#ifdef __CUDA_ARCH__ - return __float2half((float) v); -#else - return THC_float2half((float) v); -#endif - } -}; - -template <> -struct ScalarConvert { - static __host__ __device__ half to(const half v) { - return v; - } -}; - // DEPRECATED: use static_cast in kernels instead of scalar_cast template __host__ __device__ T scalar_cast(U u) { diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index 96e3938e20b0f9..b6c52791eb56d6 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -1,7 +1,7 @@ #include "THCStorage.hpp" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include @@ -55,9 +55,9 @@ int THCStorage_getDevice(THCState* state, const THCStorage* storage) { THC_API THCStorage* THCStorage_new( THCState* state, - at::ScalarType scalar_type) { + caffe2::TypeMeta data_type) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, 0, state->cudaDeviceAllocator, true).release(); diff --git a/aten/src/THC/THCStorage.cu b/aten/src/THC/THCStorage.cu index 43a293422335fc..97b66ac2b5f891 100644 --- a/aten/src/THC/THCStorage.cu +++ b/aten/src/THC/THCStorage.cu @@ -7,7 +7,7 @@ #include #endif -#include "THCHalf.h" +#include "TH/THHalf.h" #include "generic/THCStorage.cu" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index ee683feced4bfb..e8dfc2213076f1 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -9,14 +9,20 @@ #include "ATen/ScalarType.h" +#include +#include +#include + namespace at { +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_HCC__) template <> struct CTypeToScalarType<__half> : public CTypeToScalarType {}; +#endif } -THC_API THCStorage* THCStorage_new(THCState* state, at::ScalarType); +THC_API THCStorage* THCStorage_new(THCState* state, caffe2::TypeMeta); THC_API void THCStorage_retain(THCState *state, THCStorage *storage); diff --git a/aten/src/THC/THCStorageCopy.cu b/aten/src/THC/THCStorageCopy.cu index 8d7c869c12c004..c4f53f7160ca5b 100644 --- a/aten/src/THC/THCStorageCopy.cu +++ b/aten/src/THC/THCStorageCopy.cu @@ -1,7 +1,7 @@ #include "THCStorageCopy.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THC/THCStorageCopy.h b/aten/src/THC/THCStorageCopy.h index 837056fc3801d2..250b60fcb2fe74 100644 --- a/aten/src/THC/THCStorageCopy.h +++ b/aten/src/THC/THCStorageCopy.h @@ -3,7 +3,7 @@ #include "THCStorage.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index b35551301383f8..bfef8fffb0f89d 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -40,8 +40,9 @@ int64_t 
THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, return THTensor_strideLegacyNoScalars(self, dim); } -THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type) { - switch(scalar_type) { +THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta) { + auto scalar_type = at::dataTypeToScalarType(type_meta.id()); + switch (scalar_type) { case at::ScalarType::Byte: return THCudaByteTensor_new(state); case at::ScalarType::Char: @@ -189,13 +190,12 @@ void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storag if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype()); - + auto data_type = THTensor_getStoragePtr(self)->dtype(); if (storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); } else { - THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, scalar_type)); + THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, data_type)); } } diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp index 3505354c834bb4..8fecaf0b2296fc 100644 --- a/aten/src/THC/THCTensor.hpp +++ b/aten/src/THC/THCTensor.hpp @@ -21,7 +21,7 @@ THC_API int64_t THCTensor_sizeLegacyNoScalars(THCState *state, const THCTensor * THC_API int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim); THC_API int64_t THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, int dim); -THC_API THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type); +THC_API THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta); THC_API void THCTensor_resize(THCState *state, THCTensor *tensor, at::IntList size, at::IntList stride); THC_API void THCTensor_resizeNd(THCState *state, THCTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride); diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index 259912a8d91806..844539c4ac0b27 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -1,5 +1,5 @@ #include "THCApply.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" #include "THCTensorCopy.hpp" #include @@ -136,8 +136,7 @@ void THC_copyTensor(THCState* state, THCTensor* dst, THCTensor* src) { } else { // Types are different // Copy into the new format, contiguous, on the source device - srcContig = THCTensor_new(state, - at::CTypeToScalarType::to()); + srcContig = THCTensor_new(state, caffe2::TypeMeta::Make()); THCTensor_resizeAs(state, srcContig, dst); bool succ = diff --git a/aten/src/THC/THCTensorCopy.h b/aten/src/THC/THCTensorCopy.h index 74f2b592f54657..48dcc64b9fbedc 100644 --- a/aten/src/THC/THCTensorCopy.h +++ b/aten/src/THC/THCTensorCopy.h @@ -3,7 +3,7 @@ #include "THCTensor.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCStream.h" #include "generic/THCTensorCopy.h" diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index 0ea5951d4ea734..6b4a77ea816225 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -4,7 +4,7 @@ #include "THCBlas.h" #include "THCTensorCopy.h" #include "THCTensorRandom.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCApply.cuh" #include "THCReduce.cuh" #include "THCDeviceUtils.cuh" diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu index b1be123b886e03..cd5a77c2227181 100644 --- 
a/aten/src/THC/THCTensorMathPairwise.cu +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -1,6 +1,6 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -21,26 +21,6 @@ struct TensorAddConstantOp { const T val; }; -template <> -struct TensorAddConstantOp { - TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); - } - - const float fval; -}; - - template struct TensorSubConstantOp { TensorSubConstantOp(T v) : val(v) {} @@ -55,27 +35,6 @@ struct TensorSubConstantOp { const T val; }; - -template <> -struct TensorSubConstantOp { - TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); - } - - const float fval; -}; - - template struct TensorMulConstantOp { TensorMulConstantOp(T v) : val(v) {} @@ -90,25 +49,6 @@ struct TensorMulConstantOp { const T val; }; -template <> -struct TensorMulConstantOp { - TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); - } - - const float fval; -}; - template struct TensorDivConstantOp { TensorDivConstantOp(T v) : val(v) {} @@ -151,24 +91,6 @@ struct TensorDivConstantOp { const double val; }; -template <> -struct TensorDivConstantOp { - TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); - } - - const float fval; -}; - template static __device__ __forceinline__ typename std::enable_if::value, bool>::type @@ -232,22 +154,18 @@ struct TensorRemainderOp { }; template <> -struct TensorRemainderOp { - TensorRemainderOp(half v): fval(THC_half2float(v)) {} +struct TensorRemainderOp { + TensorRemainderOp(at::Half v): val(v) {} - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin - fval * floorf(fin / fval); - *out = __float2half(fout); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = *in - val * floorf(*in / val); } - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv = fv - fval * floorf(fv / fval); - *v = __float2half(fv); + __device__ __forceinline__ void operator()(at::Half* v) { + *v = *v - val * floorf(*v / val); } - const float fval; + const at::Half val; }; template @@ -278,21 +196,6 @@ struct TensorFmodOp { const double val; }; -template <> -struct TensorFmodOp { - TensorFmodOp(half v): fval(THC_half2float(v)) {} - - __device__ 
__forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*in), fval)); - } - - __device__ __forceinline__ void operator()(half* v) { - *v = __float2half(fmodf(__half2float(*v), fval)); - } - - const float fval; -}; - template struct TensorTriOp { TensorTriOp(T *start_, int64_t stride0_, int64_t stride1_, int64_t k_) diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh index fb15a05b155c0f..fb1072b8bbddc1 100644 --- a/aten/src/THC/THCTensorMathPointwise.cuh +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -4,7 +4,7 @@ #include #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -31,19 +31,6 @@ struct TensorSigmoidOp { } }; -template <> -struct TensorSigmoidOp { - __device__ __forceinline__ void operator()(half* out, half* in) const { - float fin = __half2float(*in); - *out = __float2half(1.0f / (1.0f + expf(- fin))); - } - - __device__ __forceinline__ void operator()(half* v) const { - float fv = __half2float(*v); - *v = __float2half(1.0f / (1.0f + expf(- fv))); - } -}; - template struct TensorSignOp { __device__ __forceinline__ void operator()(T* out, T* in) { @@ -70,19 +57,6 @@ struct TensorSignOp { } }; -template <> -struct TensorSignOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float orig = __half2float(*in); - *out = __float2half((orig > 0) - (orig < 0)); - } - - __device__ __forceinline__ void operator()(half* v) { - float orig = __half2float(*v); - *v = __float2half((orig > 0) - (orig < 0)); - } -}; - template struct TensorCAddOp { TensorCAddOp(T v) : val(v) {} @@ -98,31 +72,6 @@ struct TensorCAddOp { T val; }; -template <> -struct TensorCAddOp { - TensorCAddOp(half v) : val(v) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fval = __half2float(val); - float fin = __half2float(*in); - - fout += fval * fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fval = __half2float(val); - - float fout = fin1 + fval * fin2; - *out = __float2half(fout); - } - - half val; -}; - template struct TensorMulOp { __device__ __forceinline__ void operator()(T* out, T* in) { @@ -134,23 +83,6 @@ struct TensorMulOp { } }; -template <> -struct TensorMulOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fin = __half2float(*in); - fout *= fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 * fin2; - *out = __float2half(fout); - } -}; - template struct TensorPowOp { TensorPowOp(T v) : val(v) {} @@ -249,7 +181,6 @@ struct TensorCPowOp { } }; - template <> struct TensorCPowOp { __device__ __forceinline__ void operator()(double* out, double* in) { @@ -261,25 +192,6 @@ struct TensorCPowOp { } }; -template <> -struct TensorCPowOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - // No fp16 pow function yet - float fout = __half2float(*out); - float fin = __half2float(*in); - fout = powf(fout, fin); - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - // No fp16 pow 
function yet - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = powf(fin1, fin2); - *out = __float2half(fout); - } -}; - template static __device__ __forceinline__ typename std::enable_if::value, bool>::type @@ -336,17 +248,13 @@ struct TensorCRemainderOp { }; template <> -struct TensorCRemainderOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fin = __half2float(*in); - *out = fin != 0 ? __float2half(fout - fin * floorf(fout / fin)) : __float2half(NAN); +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = *in != 0.f ? *out - *in * floorf(*out / *in) : NAN; } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - *out = fin2 != 0 ? __float2half(fin1 - fin2 * floorf(fin1 / fin2)) : __float2half(NAN); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { + *out = *in2 != 0.f ? *in1 - *in2 * floorf(*in1 / *in2) : NAN; } }; @@ -384,13 +292,13 @@ struct TensorCFmodOp { }; template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*out), __half2float(*in))); +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = fmodf(*out, *in); } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2))); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { + *out = fmodf(*in1, *in2); } }; @@ -594,7 +502,6 @@ struct TensorRShiftOp { } }; - template <> struct TensorRShiftOp { __device__ __forceinline__ void @@ -671,7 +578,7 @@ template struct TensorDigammaOp { __device__ __forceinline__ void operator()(T* out, T* in) { - using compute_type = typename std::conditional::value, accreal, T>::type; + using compute_type = typename std::conditional::value, accreal, T>::type; static const double PI_f64 = 3.14159265358979323846; static const compute_type PSI_10 = 2.25175258906672110764; static const compute_type A[] = { @@ -731,7 +638,7 @@ struct TensorDigammaOp { template struct TensorTrigammaOp { - using compute_type = typename std::conditional::value, accreal, T>::type; + using compute_type = typename std::conditional::value, accreal, T>::type; __device__ __forceinline__ void operator()(T* out, T* in) { const compute_type PI = 3.14159265358979323846; diff --git a/aten/src/THC/THCTensorMode.cuh b/aten/src/THC/THCTensorMode.cuh index 0158f254a2d014..a3ed2ae8e4be93 100644 --- a/aten/src/THC/THCTensorMode.cuh +++ b/aten/src/THC/THCTensorMode.cuh @@ -7,33 +7,33 @@ struct ThrustHalfLess { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::lt(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::lt(lhs, rhs); } }; struct ThrustHalfNotEqualTo { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::ne(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::ne(lhs, rhs); } }; struct ThrustHalfEqualTo { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::eq(lhs, rhs); + __host__ __device__ 
inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::eq(lhs, rhs); } }; struct ThrustHalfEqualToPredicate { - ThrustHalfEqualToPredicate(half val): val_(val) {} - __host__ __device__ inline bool operator()(half x) { - return THCNumerics::eq(val_, x); + ThrustHalfEqualToPredicate(at::Half val): val_(val) {} + __host__ __device__ inline bool operator()(at::Half x) { + return THCNumerics::eq(val_, x); } - half val_; + at::Half val_; }; template diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index 8eb580169cc953..386473a430329a 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -14,7 +14,7 @@ #include #include -#define MAX_NUM_BLOCKS 200 +#define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 @@ -107,11 +107,11 @@ __device__ inline T reverse_bounds(T value) { } -__device__ inline half half_uniform_scale_and_shift(float x, double a, double b) { - half width = ScalarConvert::to(b - a); - half start = ScalarConvert::to(a); - half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); - return THCNumerics::add(scaled, start); +__device__ inline at::Half half_uniform_scale_and_shift(float x, double a, double b) { + at::Half width = ScalarConvert::to(b - a); + at::Half start = ScalarConvert::to(a); + at::Half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); + return THCNumerics::add(scaled, start); } #define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ @@ -181,10 +181,10 @@ GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uni GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5)))) GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5)))) -GENERATE_KERNEL2(generate_uniform, half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) -GENERATE_KERNEL2(generate_normal, half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) -GENERATE_KERNEL1(generate_exponential, half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. / lambda * log(x))))) -GENERATE_KERNEL2(generate_cauchy, half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) +GENERATE_KERNEL2(generate_uniform, at::Half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) +GENERATE_KERNEL2(generate_normal, at::Half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) +GENERATE_KERNEL1(generate_exponential, at::Half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. 
/ lambda * log(x))))) +GENERATE_KERNEL2(generate_cauchy, at::Half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) #include "generic/THCTensorRandom.cu" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorSort.cuh b/aten/src/THC/THCTensorSort.cuh index 9b75a7355a00f4..b8feedcc1f21c9 100644 --- a/aten/src/THC/THCTensorSort.cuh +++ b/aten/src/THC/THCTensorSort.cuh @@ -1,6 +1,8 @@ #ifndef THC_TENSORSORT_CUH #define THC_TENSORSORT_CUH +#include "THCTensorMath.h" +#include "THCGeneral.h" #include "THCReduceApplyUtils.cuh" #include "THCSortUtils.cuh" #include "THCTensorCopy.h" diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh index 4f7a6b8c697913..6d1ef1b6bbfe37 100644 --- a/aten/src/THC/THCTensorTopK.cuh +++ b/aten/src/THC/THCTensorTopK.cuh @@ -113,10 +113,10 @@ struct TopKTypeConfig { }; template <> -struct TopKTypeConfig { +struct TopKTypeConfig { typedef uint32_t RadixType; - static inline __device__ RadixType convert(half v) { + static inline __device__ RadixType convert(at::Half v) { #if CUDA_VERSION >= 8000 RadixType x = __half_as_ushort(v); RadixType mask = -((x >> 15)) | 0x8000; @@ -127,16 +127,16 @@ struct TopKTypeConfig { #endif } - static inline __device__ half deconvert(RadixType v) { + static inline __device__ at::Half deconvert(RadixType v) { #if CUDA_VERSION >= 8000 RadixType mask = ((v >> 15) - 1) | 0x8000; return __ushort_as_half(v ^ mask); #else assert(false); - return ScalarConvert::to(0); + return ScalarConvert::to(0); #endif } -}; +}; // This function counts the distribution of all input values in a // slice we are selecting by radix digit at `radixDigitPos`, but only @@ -214,7 +214,7 @@ __device__ DataType findPattern(DataType* smem, BitDataType desired, BitDataType desiredMask) { #ifdef __HIP_PLATFORM_HCC__ - if (threadIdx.x < 64) { + if (threadIdx.x < 64) { #else if (threadIdx.x < 32) { #endif diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh index 377b363c006ba2..aecd8f01713d55 100644 --- a/aten/src/THC/THCTensorTypeUtils.cuh +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -4,7 +4,7 @@ #include #include #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensor.hpp" #include "THCTensorInfo.cuh" #include "THCTensor.hpp" @@ -80,56 +80,4 @@ struct ScalarInv { static __host__ __device__ T to(const T v) { return ((T) 1) / v; } }; -template <> -struct ScalarNegate { - static __host__ __device__ half to(const half v) { -#ifdef __CUDA_ARCH__ - return __float2half(-__half2float(v)); -#else -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - half out = v; -#else - __half_raw out = __half_raw(v); -#endif - out.x ^= 0x8000; // toggle sign bit - return out; -#endif - } -}; - -template <> -struct ScalarInv { - static __host__ __device__ half to(const half v) { -#if defined (__CUDA_ARCH_) || defined(__HIP_PLATFORM_HCC__) - return __float2half(1.0f / __half2float(v)); -#else - float fv = THC_half2float(v); - fv = 1.0f / fv; - return THC_float2half(fv); -#endif - } -}; - -inline bool operator==(half a, half b) { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - return a.x == b.x; -#else - __half_raw araw, braw; - araw = __half_raw(a); - braw = __half_raw(b); - return araw.x == braw.x; -#endif -} - -inline bool operator!=(half a, half b) { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - return a.x != b.x; -#else - __half_raw araw, braw; - araw = __half_raw(a); - braw = 
__half_raw(b); - return araw.x != braw.x; -#endif -} - #endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index 8918a449e19585..36a001059a5787 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -3,6 +3,7 @@ #else #include +#include scalar_t* THCStorage_(data)(THCState *state, const THCStorage *self) { @@ -43,7 +44,7 @@ scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t ind THCStorage* THCStorage_(new)(THCState *state) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), 0, state->cudaDeviceAllocator, true).release(); @@ -53,7 +54,7 @@ THCStorage* THCStorage_(new)(THCState *state) THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, state->cudaDeviceAllocator, true).release(); @@ -64,7 +65,7 @@ THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, allocator, true).release(); @@ -117,7 +118,7 @@ THCStorage* THCStorage_(newWithDataAndAllocator)( ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, std::move(data), allocator, diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index 88ed2e5541820e..cd12d2bcae0fb6 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -18,7 +18,7 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) THCStorage_resize(state, self, size); } -THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { +int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { return THCStorage_getDevice(state, storage); } diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index db2b44511c2329..a7779047863466 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -63,13 +63,21 @@ scalar_t *THCTensor_(data)(THCState *state, const THCTensor *self) /* Empty init */ THCTensor *THCTensor_(new)(THCState *state) { - return c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + return c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); } /* Pointer-copy init */ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) { - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); THCTensor_(setStorageNd)(state, self, THTensor_getStoragePtr(tensor), @@ -85,7 +93,11 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive( + 
c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -594,13 +606,13 @@ int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) } const int tensorDev = THCTensor_(getDevice)(state, tensor); - + // Skips CPU tensors if (tensorDev == -1) { continue; } // Checks all tensors are on the same device - if (tensorDev != curDev) { - valid = 0; + if (tensorDev != curDev) { + valid = 0; break; } } diff --git a/aten/src/THC/generic/THCTensor.cu b/aten/src/THC/generic/THCTensor.cu index 98478341575c75..0c694f5e4e25c8 100644 --- a/aten/src/THC/generic/THCTensor.cu +++ b/aten/src/THC/generic/THCTensor.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensor.cu" #else -THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { +int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { return THCTensor_getDevice(state, tensor); } diff --git a/aten/src/THC/generic/THCTensorCopy.cpp b/aten/src/THC/generic/THCTensorCopy.cpp index 96ab307182639c..0c20edfbd9fd36 100644 --- a/aten/src/THC/generic/THCTensorCopy.cpp +++ b/aten/src/THC/generic/THCTensorCopy.cpp @@ -58,6 +58,13 @@ void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) { THTensor *selfc = THTensor_(newContiguous)(self); + int tensorDevice = THCTensor_(getDevice)(state, src); + int currentDevice; + THCudaCheck(cudaGetDevice(¤tDevice)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(tensorDevice)); + } src = THCTensor_(newContiguous)(state, src); cudaStream_t stream = THCState_getCurrentStream(state); @@ -68,6 +75,10 @@ void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) stream)); THCudaCheck(cudaStreamSynchronize(stream)); + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(currentDevice)); + } + THCTensor_(free)(state, src); THTensor_(freeCopyTo)(selfc, self); } diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 0320fdfe8035a5..2c05c74f247a48 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -2,16 +2,15 @@ #define THC_GENERIC_FILE "generic/THCTensorCopy.cu" #else -THC_API void -THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { +void THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { if (dst == src) return; THC_copyTensor(state, dst, src); } template <> THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { - THCTensor* tensor = THCTensor_new( - state, at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype())); + THCTensor* tensor = + THCTensor_new(state, THTensor_getStoragePtr(self)->dtype()); THCTensor_resizeAs(state, tensor, self); THC_copyTensor(state, tensor, self); return tensor; @@ -51,16 +50,14 @@ void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, T ReadOnly); } -THC_API void -THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { +void THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { THCTensor_copyIgnoringOverlaps(state, dst, src); } #define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA, SCALARC) \ - THC_API void \ - THCTensor_(copyCuda##TYPEC)(THCState *state, \ - THCTensor *self, \ - THCuda##TYPECUDA##Tensor *src) { \ + void THCTensor_(copyCuda##TYPEC)(THCState *state, \ + THCTensor *self, \ + 
THCuda##TYPECUDA##Tensor *src) { \ THC_copyTensor(state, self, src); \ } @@ -72,7 +69,7 @@ IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long, int64_t) // THCudaTensor aka the non-existent THCudaFloatTensor IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, , float) IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double, double) -IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, half) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, at::Half) #undef IMPLEMENT_THC_CUDA_TENSOR_COPY diff --git a/aten/src/THC/generic/THCTensorMasked.cu b/aten/src/THC/generic/THCTensorMasked.cu index f7e3e3f32a9a18..684ce31b141f79 100644 --- a/aten/src/THC/generic/THCTensorMasked.cu +++ b/aten/src/THC/generic/THCTensorMasked.cu @@ -3,9 +3,8 @@ #else -THC_API void -THCTensor_(maskedFill)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, scalar_t value) +void THCTensor_(maskedFill)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, mask)); THArgCheck(THCTensor_(nElement)(state, tensor) == @@ -20,9 +19,8 @@ THCTensor_(maskedFill)(THCState* state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(maskedFillByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, scalar_t value) +void THCTensor_(maskedFillByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); @@ -31,9 +29,8 @@ THCTensor_(maskedFillByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -THC_API void -THCTensor_(maskedCopy)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) +void THCTensor_(maskedCopy)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); @@ -98,9 +95,8 @@ THCTensor_(maskedCopy)(THCState* state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(maskedCopyByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, THCTensor *src) { +void THCTensor_(maskedCopyByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, THCTensor *src) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); THCudaByteTensor_copyByte(state, maskCuda, mask); @@ -108,9 +104,8 @@ THCTensor_(maskedCopyByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -THC_API void -THCTensor_(maskedSelect)(THCState* state, - THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { +void THCTensor_(maskedSelect)(THCState* state, + THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); THArgCheck(THCudaByteTensor_nElement(state, mask) == THCTensor_(nElement)(state, src), @@ -171,9 +166,8 @@ THCTensor_(maskedSelect)(THCState* state, } // FIXME: remove now that we have THCudaByteTensor? 
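The THCTensorCopy.cpp hunk above introduces a save/switch/restore dance around the cross-device copy: it records the current device, switches to the source tensor's device before making the source contiguous and issuing the memcpy, and switches back afterwards. A minimal sketch of the same pattern as an RAII guard; the ScopedCudaDevice name is illustrative and not part of this patch:

// Illustrative only: RAII form of the cudaGetDevice/cudaSetDevice
// save/restore pattern added to THTensor_(copyCuda) above.
struct ScopedCudaDevice {
  int prev = -1;
  explicit ScopedCudaDevice(int target) {
    THCudaCheck(cudaGetDevice(&prev));
    if (prev != target) {
      THCudaCheck(cudaSetDevice(target));
    }
  }
  ~ScopedCudaDevice() {
    // Best-effort restore; avoid asserting from a destructor.
    cudaSetDevice(prev);
  }
};
// usage sketch: ScopedCudaDevice guard(THCTensor_(getDevice)(state, src));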
-THC_API void -THCTensor_(maskedSelectByte)(THCState* state, - THCTensor *tensor, THCTensor *src, THByteTensor *mask) +void THCTensor_(maskedSelectByte)(THCState* state, + THCTensor *tensor, THCTensor *src, THByteTensor *mask) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 9ffe626dd8425f..c4f7afb6a227b9 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMath.cu" #else -THC_API void -THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) +void THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -15,8 +14,7 @@ THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(zero)(THCState *state, THCTensor *self_) +void THCTensor_(zero)(THCState *state, THCTensor *self_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); if (THCTensor_(isContiguous)(state, self_)) { @@ -35,16 +33,14 @@ THCTensor_(zero)(THCState *state, THCTensor *self_) THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) +void THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); THCTensor_(resizeAs)(state, r_, input); THCTensor_(zero)(state, r_); } -THC_API void -THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) +void THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); THCTensor_(resizeAs)(state, r_, input); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index a37645de394de8..d83b8ff929b4f6 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -5,8 +5,7 @@ #define ERROR_ONLY_FP_TYPES(func) \ THError("%s for CUDA tensors only supports floating-point types. 
Try converting the tensors with .float()", func); -THC_API accreal -THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) +accreal THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -27,11 +26,10 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) THCTensor_(data)(state, self), 1, THCTensor_(data)(state, src), 1); #elif defined(THC_REAL_IS_HALF) - accreal result = ScalarConvert::to( - THCudaBlas_Hdot(state, + accreal result = THCudaBlas_Hdot(state, THCTensor_(nElement)(state, self), THCTensor_(data)(state, self), 1, - THCTensor_(data)(state, src), 1)); + THCTensor_(data)(state, src), 1); #endif THCTensor_(free)(state, src); @@ -44,8 +42,7 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) #endif } -THC_API void -THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *mat, THCTensor *vec) +void THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *mat, THCTensor *vec) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); @@ -152,8 +149,7 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, s #endif } -THC_API void -THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *vec1, THCTensor *vec2) +void THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *vec1, THCTensor *vec2) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); @@ -250,8 +246,7 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, sc #endif } -THC_API void -THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *m1, THCTensor *m2) +void THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *m1, THCTensor *m2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) @@ -414,9 +409,8 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, s #endif } -THC_API void -THCTensor_(addbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, - scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { +void THCTensor_(addbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, + scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, t) == 2, 4, "expected 2D tensor"); @@ -479,9 +473,8 @@ __global__ void createBatchGemmBuffer3(const scalar_t** buffer1, const scalar_t } } -THC_API void -THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, - scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { +void THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, + scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || 
defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, t) == 3, 4, "expected 3D tensor"); @@ -746,7 +739,7 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor #endif } -THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) +void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 2, ra_, a)); @@ -853,8 +846,8 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens } -THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, - THCTensor *atf, THCudaIntTensor *pivots) +void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, + THCTensor *atf, THCudaIntTensor *pivots) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 3, rb_, atf, b)); diff --git a/aten/src/THC/generic/THCTensorMathCompare.cu b/aten/src/THC/generic/THCTensorMathCompare.cu index 0a0041ab9e4784..3c8e8ce0c325a7 100644 --- a/aten/src/THC/generic/THCTensorMathCompare.cu +++ b/aten/src/THC/generic/THCTensorMathCompare.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu" #else -THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -10,7 +10,7 @@ THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -18,7 +18,7 @@ THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -26,7 +26,7 @@ THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -34,7 +34,7 @@ THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -42,7 +42,7 
@@ THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -50,7 +50,7 @@ THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -58,7 +58,7 @@ THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -66,7 +66,7 @@ THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -74,7 +74,7 @@ THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -82,7 +82,7 @@ THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -90,7 +90,7 @@ THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, diff --git a/aten/src/THC/generic/THCTensorMathCompareT.cu b/aten/src/THC/generic/THCTensorMathCompareT.cu index 6397a0b7caaa96..1bd4b9909fc26d 100644 --- a/aten/src/THC/generic/THCTensorMathCompareT.cu +++ b/aten/src/THC/generic/THCTensorMathCompareT.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu" #else -THC_API void -THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); 
THC_logicalTensor(state, self_, src1, src2, @@ -11,8 +10,7 @@ THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -20,8 +18,7 @@ THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -29,8 +26,7 @@ THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -38,8 +34,7 @@ THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -47,8 +42,7 @@ THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -56,8 +50,7 @@ THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -65,8 +58,7 @@ THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -74,8 +66,7 @@ THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); 
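Nearly every hunk in these comparison files makes the same mechanical change: THC_API is dropped from the definition while the declaration in the corresponding header keeps it, which is the usual export-macro convention, since the attribute on the declaration is what controls visibility. A toy illustration of that convention, with made-up names:

// Toy example of the declaration-vs-definition split applied throughout this patch.
#ifdef _WIN32
#define MYLIB_API __declspec(dllexport)
#else
#define MYLIB_API __attribute__((visibility("default")))
#endif

MYLIB_API int mylib_add(int a, int b);  // header: exported declaration

int mylib_add(int a, int b) {           // source: no export macro needed here
  return a + b;
}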
THC_logicalTensor(state, self_, src1, src2, @@ -83,8 +74,7 @@ THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -92,8 +82,7 @@ THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -101,8 +90,7 @@ THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index ecf39d9a1bf0f2..29c7999f74a9b6 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -60,7 +60,7 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T } -THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -98,8 +98,8 @@ THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #endif } -THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, - const char *uplo, const char *trans, const char *diag) +void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, + const char *uplo, const char *trans, const char *diag) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -135,7 +135,7 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, #endif } -THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -182,7 +182,7 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #endif } -THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) +void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) { #ifdef USE_MAGMA int64_t n = THTensor_sizeLegacyNoScalars(a, 0); @@ -247,7 +247,7 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void 
THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) +void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) { #ifdef USE_MAGMA THArgCheck(a_->dim() == 2, 3, "A should be 2 dimensional"); @@ -321,7 +321,7 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) +void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) { #ifdef USE_MAGMA THCTensor *ra_ = THCTensor_(new)(state); @@ -332,7 +332,7 @@ THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, #endif } -THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) +void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -396,7 +396,7 @@ THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, #endif } -THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) +void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) { THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); @@ -524,7 +524,7 @@ __global__ void THCTensor_(copyLowerSymmetric)(scalar_t *input, int n, int len) } } -THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -564,7 +564,7 @@ THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, co #endif } -THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be (non-empty) 2 dimensional"); @@ -600,7 +600,7 @@ THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, co #endif } -THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) +void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); @@ -632,7 +632,7 @@ THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, TH #endif } -THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) +void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -669,7 +669,7 @@ THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_ #endif } -THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) +void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) { #ifdef USE_MAGMA 
THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index 40d6bdb6382983..0f2d0067ff5546 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu" #else -THC_API void -THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -21,8 +20,7 @@ THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -40,30 +38,17 @@ THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) +void THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { -#ifdef THC_REAL_IS_HALF - auto v = THC_half2float(value) * THC_half2float(alpha); - THCTensor_(add)(state, self_, src_, THC_float2half(v)); -#else THCTensor_(add)(state, self_, src_, value * alpha); -#endif } -THC_API void -THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) +void THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { -#ifdef THC_REAL_IS_HALF - auto v = THC_half2float(value) * THC_half2float(alpha); - THCTensor_(sub)(state, self_, src_, THC_float2half(v)); -#else THCTensor_(sub)(state, self_, src_, value * alpha); -#endif } -THC_API void -THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -81,8 +66,7 @@ THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); THArgCheck(value != ScalarConvert::to(0), 3, "divide by zero"); @@ -102,8 +86,7 @@ THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCTensor_(mul)(state, self_, src_, pow(2, value)); @@ -126,8 +109,7 @@ THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void 
THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCTensor_(mul)(state, self_, src_, pow(2, -value)); @@ -150,8 +132,7 @@ THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -169,8 +150,7 @@ THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -245,7 +225,7 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ THCudaCheck(cudaGetLastError()); } -THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) +int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (!THCTensor_(isSameSizeAs(state, self_, src_))) { @@ -269,8 +249,7 @@ THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_ return min != 0; } -THC_API void -THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitand only supported for integer type tensors"); @@ -291,8 +270,7 @@ THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitor only supported for integer type tensors"); @@ -313,8 +291,7 @@ THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t v #endif } -THC_API void -THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitxor only supported for integer type tensors"); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 9192d6c9f9a1d5..4ff836fd53dda6 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -108,8 +108,7 @@ void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, scalar THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) +void THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); @@ -211,8 +210,7 @@ void THCTensor_(polygamma)(THCState* 
state, THCTensor* self_, int64_t n, THCTens THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, scalar_t w) +void THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, scalar_t w) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, result, a, b)); THArgCheck(THCTensor_(nElement)(state, a) == @@ -228,39 +226,42 @@ THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, #endif -THC_API void -THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) +namespace { +c10::intrusive_ptr retainTensorImpl(THCTensor* self) { + c10::raw::intrusive_ptr::incref(self); + return c10::intrusive_ptr::reclaim(self); +} +} + +void THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { - auto out = at::Tensor(self_, true); + auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF auto alpha = at::Half(value); #else auto alpha = value; #endif - at::add_out(out, at::Tensor(src1, true), at::Tensor(src2, true), alpha); + at::add_out(out, retainTensorImpl(src1), retainTensorImpl(src2), alpha); } -THC_API void -THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) +void THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { - auto out = at::Tensor(self_, true); + auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF auto alpha = at::Half(value); #else auto alpha = value; #endif - at::sub_out(out, at::Tensor(src1, true), at::Tensor(src2, true), alpha); + at::sub_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2)), alpha); } -THC_API void -THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { - auto out = at::Tensor(self_, true); - at::mul_out(out, at::Tensor(src1, true), at::Tensor(src2, true)); + auto out = at::Tensor(retainTensorImpl(self_)); + at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -THC_API void -THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -367,15 +368,13 @@ void THCTensor_(tpow)(THCState *state, THCTensor *self_, scalar_t value, THCTens THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { - auto out = at::Tensor(self_, true); - at::div_out(out, at::Tensor(src1, true), at::Tensor(src2, true)); + auto out = at::Tensor(retainTensorImpl(self_)); + at::div_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -THC_API void -THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) return THError("clshift not supported for torch.CudaHalfTensor"); @@ -402,8 +401,7 @@ THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } 
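The retainTensorImpl helper introduced just above is what makes the at::add_out/sub_out/mul_out/div_out rewrites safe: c10::intrusive_ptr reclaim adopts an existing reference (and will decref it on destruction), so wrapping a raw THCTensor* that the caller still owns requires an incref first. A hedged sketch of that contract; only the two c10 calls below appear in the patch, the wrapper name is illustrative:

// Wrap a borrowed raw pointer (T must derive from c10::intrusive_ptr_target).
template <typename T>
c10::intrusive_ptr<T> wrap_borrowed(T* raw) {
  c10::raw::intrusive_ptr::incref(raw);        // +1: the reference the wrapper will own
  return c10::intrusive_ptr<T>::reclaim(raw);  // adopt that +1; the caller's reference is untouched
}
// By contrast, THCStorage_(new)(state) already hands back an owned reference,
// which is why the THCTensor_(new) hunks earlier reclaim it without an extra incref.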
-THC_API void -THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) return THError("crshift not supported for torch.CudaHalfTensor"); @@ -430,8 +428,7 @@ THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -449,8 +446,7 @@ THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s } } -THC_API void -THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -468,8 +464,7 @@ THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s } } -THC_API void -THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -487,8 +482,7 @@ THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTen } } -THC_API void -THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -506,8 +500,7 @@ THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor * } } -THC_API void -THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) +void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -523,8 +516,7 @@ THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t } } -THC_API void -THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) +void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -540,8 +532,7 @@ THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t } } -THC_API void -THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) +void THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); if(self_ != t) @@ -565,8 +556,7 @@ THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) +void THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) { 
THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); if(self_ != t) @@ -589,8 +579,7 @@ THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitand is only supported for integer type tensors"); @@ -617,8 +606,7 @@ THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitor is only supported for integer type tensors"); @@ -645,8 +633,7 @@ THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor #endif } -THC_API void -THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitor is only supported for integer type tensors"); diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index 958bf623b85f05..4d7e81750c0ec7 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu" #else -THC_API void -THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { +void THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); if (!THC_reduceDim(state, self, src, thrust::identity{}, @@ -18,8 +17,7 @@ THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { +void THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); if (!THC_reduceDim(state, self, src, thrust::identity{}, @@ -34,8 +32,7 @@ THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) +void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); const accreal size = scalar_cast(THCTensor_(size)(state, src, dim)); @@ -54,8 +51,7 @@ THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void -THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t maxnorm) +void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t 
maxnorm) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); THCTensor *self_; @@ -89,8 +85,7 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t va THCTensor_(free)(state, data); } -THC_API void -THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +void THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); @@ -117,8 +112,7 @@ THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension } } -THC_API void -THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +void THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); @@ -145,15 +139,13 @@ THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension } } -THC_API accreal -THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) +accreal THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); return THCNumerics::sqrt((THCTensor_(varall)(state, self, biased))); } -THC_API accreal -THCTensor_(varall)(THCState *state, THCTensor *self, int biased) +accreal THCTensor_(varall)(THCState *state, THCTensor *self, int biased) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal mean = THCTensor_(meanall)(state, self); @@ -176,8 +168,7 @@ THCTensor_(varall)(THCState *state, THCTensor *self, int biased) return val; } -THC_API void -THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _value, int dimension, int keepdim) +void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _value, int dimension, int keepdim) { const accreal value = scalar_cast(_value); THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -221,8 +212,7 @@ THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _val THCudaCheck(cudaGetLastError()); } -THC_API accreal -THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t _value) +accreal THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t _value) { const accreal value = scalar_cast(_value); THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); @@ -295,8 +285,7 @@ accreal THCTensor_(dist)(THCState *state, THCTensor *self, #endif -THC_API accreal -THCTensor_(sumall)(THCState *state, THCTensor *self) { +accreal THCTensor_(sumall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -311,8 +300,7 @@ THCTensor_(sumall)(THCState *state, THCTensor *self) { return val; } -THC_API accreal -THCTensor_(prodall)(THCState *state, THCTensor *self) { +accreal THCTensor_(prodall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -327,15 +315,13 @@ THCTensor_(prodall)(THCState *state, THCTensor *self) { return val; } -THC_API accreal -THCTensor_(meanall)(THCState *state, THCTensor *self) +accreal THCTensor_(meanall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self); } -THC_API scalar_t -THCTensor_(minall)(THCState *state, THCTensor *self) { +scalar_t 
THCTensor_(minall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -349,8 +335,7 @@ THCTensor_(minall)(THCState *state, THCTensor *self) { return scalar_cast(val); } -THC_API scalar_t -THCTensor_(maxall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(maxall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -364,8 +349,7 @@ THCTensor_(maxall)(THCState *state, THCTensor *self) { return scalar_cast(val); } -THC_API scalar_t -THCTensor_(medianall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(medianall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); scalar_t val; @@ -392,13 +376,12 @@ THCTensor_(medianall)(THCState *state, THCTensor *self) { return val; } -THC_API void -THCTensor_(median)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *self, - int dimension, - int keepdim) { +void THCTensor_(median)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *self, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); int64_t t_size_dim, k; @@ -434,13 +417,12 @@ THCTensor_(median)(THCState *state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(max)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *src, - int dimension, - int keepdim) { +void THCTensor_(max)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); thrust::pair @@ -453,13 +435,12 @@ THCTensor_(max)(THCState *state, MaxValuePair()); } -THC_API void -THCTensor_(min)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *src, - int dimension, - int keepdim) { +void THCTensor_(min)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); thrust::pair diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu index 274093ef105ae5..db72921bcfd1b0 100644 --- a/aten/src/THC/generic/THCTensorMode.cu +++ b/aten/src/THC/generic/THCTensorMode.cu @@ -2,13 +2,13 @@ #define THC_GENERIC_FILE "generic/THCTensorMode.cu" #else -THC_API void THCTensor_(calculateMode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - THCudaLongStorage *sortBuffer, - int dimension, - THLongStorage *position) { +void THCTensor_(calculateMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position) { THAssert(THCTensor_(isContiguous)(state, input)); // Because the input is contiguous, we want to get a reference to the @@ -129,14 +129,14 @@ THC_API void THCTensor_(calculateMode)(THCState *state, } // this probably could be a loop, not a recursive algorithm -THC_API void THCTensor_(dimApplyMode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - THCudaLongStorage *sortBuffer, - int dimension, - THLongStorage *position, - int curDim) { +void THCTensor_(dimApplyMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int 
dimension, + THLongStorage *position, + int curDim) { int64_t ndim = THCTensor_(nDimensionLegacyAll)(state, input); // Because we have transposed the Tensor, the data for the dimension we are mode'ing along @@ -155,12 +155,12 @@ THC_API void THCTensor_(dimApplyMode)(THCState *state, #define MAX_GRID_SIZE 65535 #define MAX_BLOCK_SIZE 1024 -THC_API void THCTensor_(mode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - int dimension, - int keepdim) { +void THCTensor_(mode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + int dimension, + int keepdim) { THCTensor *transposed, *contiguous, *valuesTransposed; THLongStorage *position; THCudaLongStorage *sortBuffer; diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu index 620c73e9af01d3..4cbc6dd1a29999 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -6,7 +6,7 @@ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) +void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -21,7 +21,7 @@ THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, do THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) +void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -36,13 +36,13 @@ THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { +void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { THCTensor_(resizeAs)(state, self, means); THCTensor_(normal)(state, self, 0, stddev); THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); } -THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) +void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) { THCTensor_(resizeAs)(state, self, stddevs); THCTensor_(normal)(state, self, 0, 1); @@ -50,7 +50,7 @@ THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double THCTensor_(add)(state, self, self, ScalarConvert::to(mean)); } -THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) +void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) { THCTensor_(resizeAs)(state, self, means); THCTensor_(normal)(state, self, 0, 1); @@ -58,7 +58,7 @@ THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); } -THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) +void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -75,7 +75,7 @@ THC_API void 
THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mea THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) +void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -91,7 +91,7 @@ THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double l THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) +void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -128,11 +128,11 @@ void THCTensor_(renormRows)(struct THCState* state, rows, cols); } -THC_API void THCTensor_(multinomial)(struct THCState *state, - THCudaLongTensor *self, - THCTensor *prob_dist, - int n_sample, - int with_replacement) +void THCTensor_(multinomial)(struct THCState *state, + THCudaLongTensor *self, + THCTensor *prob_dist, + int n_sample, + int with_replacement) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, prob_dist)); THCGenerator* gen = THCRandom_getGenerator(state); @@ -299,7 +299,7 @@ THC_API void THCTensor_(multinomial)(struct THCState *state, } } -THC_API void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ +void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ THAssert(THCTensor_(isContiguous)(state, _q)); THAssert(THCudaLongTensor_isContiguous(state, _J)); THAssert(THCTensor_(isContiguous)(state, _probs)); @@ -354,7 +354,7 @@ THC_API void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_prob THCudaLongTensor_free(state, larger_short); } -THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ +void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ THAssert(THCTensor_(isContiguous)(state, _q)); THAssert(THCudaLongTensor_isContiguous(state, _J)); THCGenerator* gen = THCRandom_getGenerator(state); @@ -388,7 +388,7 @@ GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_do GENERATE_KERNEL1(generate_bernoulli, scalar_t, double p, float, curand_uniform, (ScalarConvert::to(x <= p))) #endif -THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) +void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -413,7 +413,7 @@ void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p } #define DEFINE_BERNOULLI_TENSOR(NAME, PROB_TYPE, PROB_DATA_TYPE) \ -THC_API void THCTensor_(NAME)(THCState* state, \ +void THCTensor_(NAME)(THCState* state, \ THCTensor *self_, PROB_TYPE *probs_) \ { \ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \ @@ -458,7 +458,7 @@ GENERATE_KERNEL2(generate_random, scalar_t, int32_t base, uint32_t range, uint32 static_cast(static_cast(x % range + base))) #endif -THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) +void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) { 
THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -474,7 +474,7 @@ THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) +void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) { THArgCheck(min_val < max_val, 2, "max must be greater than min, but got: min = %lld, max = %lld", min_val, max_val); @@ -502,14 +502,14 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_ THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) +void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) { THCTensor_(clampedRandom)(state, self_, 0LL, max_val); }; #define HLF_MANT_DIG 11 -THC_API void THCTensor_(random)(THCState* state, THCTensor *self_) +void THCTensor_(random)(THCState* state, THCTensor *self_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu index e310e422600595..03aad5f2d305fd 100644 --- a/aten/src/THC/generic/THCTensorSort.cu +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -5,10 +5,10 @@ // In alignment with default sort on a c++ map, this function // will permute key and value tensors identically, and // in such a way that the 'key' tensor is ordered numerically -THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, - THCTensor* key, - THCudaLongTensor* value, - int dim, bool dir) { +void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* key, + THCudaLongTensor* value, + int dim, bool dir) { THArgCheck(key->sizes().equals(value->sizes()), 2, "Key tensor must have same size as value tensor"); int dims = THCudaLongTensor_nDimensionLegacyNoScalars(state, value); @@ -274,11 +274,11 @@ void THCTensor_(sortViaThrust)(THCState* state, THCudaLongTensor_freeCopyTo(state, trContigIndices, indices); } -THC_API void THCTensor_(sort)(THCState* state, - THCTensor *sorted, - THCudaLongTensor *indices, - THCTensor *input, - int dim, int order) { +void THCTensor_(sort)(THCState* state, + THCTensor *sorted, + THCudaLongTensor *indices, + THCTensor *input, + int dim, int order) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, sorted, input)); THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); int64_t dims = THCTensor_(nDimensionLegacyNoScalars)(state, sorted); diff --git a/aten/src/THC/generic/THCTensorSort.h b/aten/src/THC/generic/THCTensorSort.h index 009d825a223975..eba93fb75773c3 100644 --- a/aten/src/THC/generic/THCTensorSort.h +++ b/aten/src/THC/generic/THCTensorSort.h @@ -4,10 +4,17 @@ /* Performs an in-place sort of (keys, values). 
Only works for slice sizes <= 2048 at the moment (slice size == size of keys/values dim `dim`) */ +#ifdef __cplusplus +THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* keys, + THCudaLongTensor* values, + int dim, bool dir); +#else THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, THCTensor* keys, THCudaLongTensor* values, int dim, int order); +#endif /* Performs an out-of-place sort of `input`, returning the per-slice indices in `indices` and the sorted values in `sorted` */ diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index 71ee008659b12a..a195dfbe5ca7a8 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -2,11 +2,11 @@ #define THC_GENERIC_FILE "generic/THCTensorTopK.cu" #else -THC_API void THCTensor_(topk)(THCState* state, - THCTensor *topK, - THCudaLongTensor *indices, - THCTensor *input_, - int64_t k, int dim, int dir, int sorted) { +void THCTensor_(topk)(THCState* state, + THCTensor *topK, + THCudaLongTensor *indices, + THCTensor *input_, + int64_t k, int dim, int dir, int sorted) { THAssert(topK != NULL && indices != NULL && input_ != NULL); THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input_)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, topK) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); diff --git a/aten/src/THCUNN/Abs.cu b/aten/src/THCUNN/Abs.cu index 72b7ff3c2f53d9..98542eda7e8a75 100644 --- a/aten/src/THCUNN/Abs.cu +++ b/aten/src/THCUNN/Abs.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/AbsCriterion.cu b/aten/src/THCUNN/AbsCriterion.cu index cb0f47510bc559..30aa975594a160 100644 --- a/aten/src/THCUNN/AbsCriterion.cu +++ b/aten/src/THCUNN/AbsCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/BCECriterion.cu b/aten/src/THCUNN/BCECriterion.cu index 3624588015c8a1..e458bb81c9725e 100644 --- a/aten/src/THCUNN/BCECriterion.cu +++ b/aten/src/THCUNN/BCECriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/BatchNormalization.cu b/aten/src/THCUNN/BatchNormalization.cu index 0c393c3a9d6db0..97579d1c4aefd5 100644 --- a/aten/src/THCUNN/BatchNormalization.cu +++ b/aten/src/THCUNN/BatchNormalization.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/ClassNLLCriterion.cu b/aten/src/THCUNN/ClassNLLCriterion.cu index 1043454ff1528b..dd430e9b88d120 100644 --- a/aten/src/THCUNN/ClassNLLCriterion.cu +++ b/aten/src/THCUNN/ClassNLLCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" diff --git a/aten/src/THCUNN/Col2Im.cu b/aten/src/THCUNN/Col2Im.cu index d7fd995de4b88b..73eca7ff16ad30 100644 --- a/aten/src/THCUNN/Col2Im.cu +++ b/aten/src/THCUNN/Col2Im.cu @@ -4,7 +4,7 @@ #include "THCTensor.hpp" #include "THCStorage.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" 
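// The THCTensorSort.h hunk above leaves two prototypes for the same routine: C++
// translation units see the new `bool dir` parameter that the .cu definition now
// uses, while plain C translation units keep the old `int order` form. A stand-alone
// sketch of that header pattern (sort_inplace is an illustrative name, not the real
// THC declaration):
#ifdef __cplusplus
extern "C" void sort_inplace(float* keys, long* values, int dim, bool descending);  // what C++ callers see
#else
void sort_inplace(float* keys, long* values, int dim, int descending);              // what C callers see
#endif
// Both parameters occupy an int-sized argument slot in the common calling
// conventions, which is presumably what makes the split view safe here.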
#include "generic/Col2Im.cu" diff --git a/aten/src/THCUNN/DistKLDivCriterion.cu b/aten/src/THCUNN/DistKLDivCriterion.cu index e4e85b71045f8e..bd26c0c003bb7e 100644 --- a/aten/src/THCUNN/DistKLDivCriterion.cu +++ b/aten/src/THCUNN/DistKLDivCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu index 9c4c2ea1fdc8b6..ec441645e8d7d5 100644 --- a/aten/src/THCUNN/ELU.cu +++ b/aten/src/THCUNN/ELU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/GatedLinearUnit.cu b/aten/src/THCUNN/GatedLinearUnit.cu index aba9f1e794e308..376fdb3855eb23 100644 --- a/aten/src/THCUNN/GatedLinearUnit.cu +++ b/aten/src/THCUNN/GatedLinearUnit.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "common.h" diff --git a/aten/src/THCUNN/HardTanh.cu b/aten/src/THCUNN/HardTanh.cu index 539b22fec5a515..8d6a7953975f75 100644 --- a/aten/src/THCUNN/HardTanh.cu +++ b/aten/src/THCUNN/HardTanh.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/Im2Col.cu b/aten/src/THCUNN/Im2Col.cu index 95bdcd4e8b9cd7..252c488df33f00 100644 --- a/aten/src/THCUNN/Im2Col.cu +++ b/aten/src/THCUNN/Im2Col.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/IndexLinear.cu b/aten/src/THCUNN/IndexLinear.cu index eebb7efc153d88..032e8e31d2cffb 100644 --- a/aten/src/THCUNN/IndexLinear.cu +++ b/aten/src/THCUNN/IndexLinear.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/L1Cost.cu b/aten/src/THCUNN/L1Cost.cu index eda58c18e2c6e4..4f11a94f8106c3 100644 --- a/aten/src/THCUNN/L1Cost.cu +++ b/aten/src/THCUNN/L1Cost.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LeakyReLU.cu b/aten/src/THCUNN/LeakyReLU.cu index ec9efb836c3441..dec13dfd112dcc 100644 --- a/aten/src/THCUNN/LeakyReLU.cu +++ b/aten/src/THCUNN/LeakyReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LogSigmoid.cu b/aten/src/THCUNN/LogSigmoid.cu index 59ee39a6871bf7..e318fcea4f92f5 100644 --- a/aten/src/THCUNN/LogSigmoid.cu +++ b/aten/src/THCUNN/LogSigmoid.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LookupTable.cu b/aten/src/THCUNN/LookupTable.cu index 59aa7e8f4612ba..05eb432871ed34 100644 --- a/aten/src/THCUNN/LookupTable.cu +++ b/aten/src/THCUNN/LookupTable.cu @@ -2,7 +2,7 @@ #include "common.h" #include "THCThrustAllocator.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensorSort.cuh" #include "../THC/THCTensorMathReduce.cuh" diff --git a/aten/src/THCUNN/LookupTableBag.cu b/aten/src/THCUNN/LookupTableBag.cu 
index 83a60efbb32207..2c9e09cdb7624d 100644 --- a/aten/src/THCUNN/LookupTableBag.cu +++ b/aten/src/THCUNN/LookupTableBag.cu @@ -11,7 +11,7 @@ #include #endif #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensorSort.cuh" diff --git a/aten/src/THCUNN/MSECriterion.cu b/aten/src/THCUNN/MSECriterion.cu index e9571fe06c4e30..99b287baba38a9 100644 --- a/aten/src/THCUNN/MSECriterion.cu +++ b/aten/src/THCUNN/MSECriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/MarginCriterion.cu b/aten/src/THCUNN/MarginCriterion.cu index 7ccdbb725fe6b1..459d62a4e57a81 100644 --- a/aten/src/THCUNN/MarginCriterion.cu +++ b/aten/src/THCUNN/MarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/MultiLabelMarginCriterion.cu index 13b432c15c38c6..220de837d2349e 100644 --- a/aten/src/THCUNN/MultiLabelMarginCriterion.cu +++ b/aten/src/THCUNN/MultiLabelMarginCriterion.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "THCReduceApplyUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/MultiMarginCriterion.cu b/aten/src/THCUNN/MultiMarginCriterion.cu index c2fa2134626101..de802c82979985 100644 --- a/aten/src/THCUNN/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/MultiMarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/PReLU.cu b/aten/src/THCUNN/PReLU.cu index cdc6b2b71a1ee4..c89152487d2f3f 100644 --- a/aten/src/THCUNN/PReLU.cu +++ b/aten/src/THCUNN/PReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "THCTensor.hpp" @@ -69,7 +69,7 @@ struct PReLUAccGradParametersShared { __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { - *gradInput = (*input) * (*gradOutput) * (*input <= 0); + *gradInput = (*input) * (*gradOutput) * static_cast(*input <= 0); } }; @@ -84,7 +84,7 @@ struct PReLUAccGradParameters __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { - *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0); + *gradInput = (*input) * (*gradOutput) * scale * static_cast(*input <= 0); } }; @@ -99,7 +99,7 @@ struct PReLUAccGradParameters1to1 __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput) { - *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0); + *gradWeight += (*input) * (*gradOutput) * scale * static_cast(*input <= 0); } }; diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu index 388627791a012f..d044fadadfb0d1 100644 --- a/aten/src/THCUNN/RReLU.cu +++ b/aten/src/THCUNN/RReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "common.h" @@ -15,8 +15,8 @@ template inline T __device__ curand_uniform_type(curandStateMtgp32 *state); template <> -inline half __device__ curand_uniform_type(curandStateMtgp32 *state) { - return 
ScalarConvert::to(curand_uniform(state)); +inline THHalf __device__ curand_uniform_type(curandStateMtgp32 *state) { + return ScalarConvert::to(curand_uniform(state)); } template <> diff --git a/aten/src/THCUNN/Sigmoid.cu b/aten/src/THCUNN/Sigmoid.cu index 0be57d6a621dfc..6ade198fe02af1 100644 --- a/aten/src/THCUNN/Sigmoid.cu +++ b/aten/src/THCUNN/Sigmoid.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SmoothL1Criterion.cu b/aten/src/THCUNN/SmoothL1Criterion.cu index c8018d997365dc..48a86e3efec996 100644 --- a/aten/src/THCUNN/SmoothL1Criterion.cu +++ b/aten/src/THCUNN/SmoothL1Criterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/SoftMarginCriterion.cu b/aten/src/THCUNN/SoftMarginCriterion.cu index ee53e76dca2625..63b0ef3cc7ca7e 100644 --- a/aten/src/THCUNN/SoftMarginCriterion.cu +++ b/aten/src/THCUNN/SoftMarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/SoftPlus.cu b/aten/src/THCUNN/SoftPlus.cu index 42b2c3c5ef0617..a8a36c22fa918a 100644 --- a/aten/src/THCUNN/SoftPlus.cu +++ b/aten/src/THCUNN/SoftPlus.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SoftShrink.cu b/aten/src/THCUNN/SoftShrink.cu index a4e45d87c6b3a3..5ec104155ea737 100644 --- a/aten/src/THCUNN/SoftShrink.cu +++ b/aten/src/THCUNN/SoftShrink.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SparseLinear.cu b/aten/src/THCUNN/SparseLinear.cu index 2bc17a75a76f7e..08f72046341ca7 100644 --- a/aten/src/THCUNN/SparseLinear.cu +++ b/aten/src/THCUNN/SparseLinear.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu index 2c671dad5a8364..ff68ab8440f757 100644 --- a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu index 592e6fd8b08154..591dd012fdd521 100644 --- a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/SpatialAveragePooling.cu b/aten/src/THCUNN/SpatialAveragePooling.cu index ce9941a62398c7..d07255a954ee77 100644 --- a/aten/src/THCUNN/SpatialAveragePooling.cu +++ b/aten/src/THCUNN/SpatialAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "common.h" diff --git 
a/aten/src/THCUNN/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/SpatialClassNLLCriterion.cu index 83addd09a2769b..92262354b3b696 100644 --- a/aten/src/THCUNN/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/SpatialClassNLLCriterion.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialConvolutionLocal.cu b/aten/src/THCUNN/SpatialConvolutionLocal.cu index 17801d52b12688..0af7685432991b 100644 --- a/aten/src/THCUNN/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/SpatialConvolutionLocal.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialConvolutionMM.cu b/aten/src/THCUNN/SpatialConvolutionMM.cu index 4a59acb2975188..d9f6f128efe81b 100644 --- a/aten/src/THCUNN/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/SpatialConvolutionMM.cu @@ -3,7 +3,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialConvolutionMM.cu" diff --git a/aten/src/THCUNN/SpatialCrossMapLRN.cu b/aten/src/THCUNN/SpatialCrossMapLRN.cu index cd6f081b1302df..2262ddba4743ed 100644 --- a/aten/src/THCUNN/SpatialCrossMapLRN.cu +++ b/aten/src/THCUNN/SpatialCrossMapLRN.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialDilatedConvolution.cu b/aten/src/THCUNN/SpatialDilatedConvolution.cu index b8e96024fdaa87..b0edadb2b357fc 100644 --- a/aten/src/THCUNN/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/SpatialDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu index 6732e4f2b53409..3aef4ecf524cce 100644 --- a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu +++ b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCNumerics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu index f3ca162453107a..71ddce8c84a995 100644 --- a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu +++ b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu @@ -3,7 +3,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialFullConvolution.cu b/aten/src/THCUNN/SpatialFullConvolution.cu index 4e37ecf280bbf5..b4eff4b9c22d1a 100644 --- a/aten/src/THCUNN/SpatialFullConvolution.cu +++ b/aten/src/THCUNN/SpatialFullConvolution.cu @@ -1,7 +1,7 @@ #include "THCUNN.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialFullConvolution.cu" diff --git a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu index 
61e1fe5910ad18..9ba2236cea66cd 100644 --- a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "im2col.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialFullDilatedConvolution.cu" diff --git a/aten/src/THCUNN/SpatialReflectionPadding.cu b/aten/src/THCUNN/SpatialReflectionPadding.cu index 96472eed08b839..0b0643c8e41342 100644 --- a/aten/src/THCUNN/SpatialReflectionPadding.cu +++ b/aten/src/THCUNN/SpatialReflectionPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialReplicationPadding.cu b/aten/src/THCUNN/SpatialReplicationPadding.cu index f63c2090d5fb14..1ee5c62f6e7132 100644 --- a/aten/src/THCUNN/SpatialReplicationPadding.cu +++ b/aten/src/THCUNN/SpatialReplicationPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialSubSampling.cu b/aten/src/THCUNN/SpatialSubSampling.cu index bb0484662254fd..8e8e390c136388 100644 --- a/aten/src/THCUNN/SpatialSubSampling.cu +++ b/aten/src/THCUNN/SpatialSubSampling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu index 07daa0e9fec01d..b093ee287edce0 100644 --- a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu +++ b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/SpatialUpSamplingNearest.cu index 889d64e1817e1c..8b0784cfd75351 100644 --- a/aten/src/THCUNN/SpatialUpSamplingNearest.cu +++ b/aten/src/THCUNN/SpatialUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/Sqrt.cu b/aten/src/THCUNN/Sqrt.cu index a52ce34117aaf0..7358f8c6bd0bc7 100644 --- a/aten/src/THCUNN/Sqrt.cu +++ b/aten/src/THCUNN/Sqrt.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/Square.cu b/aten/src/THCUNN/Square.cu index 66bbec49d29cd9..f44fbfe7ea4fc4 100644 --- a/aten/src/THCUNN/Square.cu +++ b/aten/src/THCUNN/Square.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh index 5f8fda89909552..fe8a8bbc3cdc1a 100644 --- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh +++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh @@ -1,23 +1,23 @@ #ifndef THC_HALF_AUTO_NUMERICS_INC #define THC_HALF_AUTO_NUMERICS_INC -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" -// WARNING: THCNumerics is being deprecated. 
Read the comments and function usage +// WARNING: THCNumerics is being deprecated. Read the comments and function usage // in THCNumerics to learn about the deprecation -// +// // Half numerics functions defined as free functions, so cunn code can be -//written generically, i.e. without excessive calling of THCNumerics functions. +// written generically, i.e. without excessive calling of THCNumerics functions. // these functions should move to THCNumerics -inline __host__ __device__ half fmaxType(half x, half y) { - return THCNumerics::ge(x, y) ? x : y; +inline __host__ __device__ THHalf fmaxType(THHalf x, THHalf y) { + return THCNumerics::ge(x, y) ? x : y; } -inline __host__ __device__ float fmaxType(float x, half y) { - return fmaxf(x, ScalarConvert::to(y)); +inline __host__ __device__ float fmaxType(float x, THHalf y) { + return fmaxf(x, ScalarConvert::to(y)); } inline __host__ __device__ float fmaxType(float x, float y) { @@ -31,217 +31,40 @@ inline __host__ __device__ double fmaxType(double x, double y) { // arithmetic functions -inline __host__ __device__ half operator+(half a, half b) { - return THCNumerics::add(a, b); +inline __host__ __device__ THHalf abs(THHalf a) { + return THCNumerics::abs(a); } -inline __host__ __device__ float operator+(half a, float b) { - return ScalarConvert::to(a) + b; +inline __host__ __device__ THHalf exp(THHalf a) { + return THCNumerics::exp(a); } -inline __host__ __device__ float operator+(float a, half b) { - return a + ScalarConvert::to(b); +inline __host__ __device__ THHalf log10(THHalf a) { + return THCNumerics::log10(a); } -inline __host__ __device__ double operator+(double a, half b) { - return a + ScalarConvert::to(b); +inline __host__ __device__ THHalf log1p(THHalf a) { + return THCNumerics::log1p(a); } -inline __host__ __device__ half operator-(half a) { - return THCNumerics::neg(a); +inline __host__ __device__ THHalf log2(THHalf a) { + return THCNumerics::log2(a); } -inline __host__ __device__ half operator-(half a, half b) { - return THCNumerics::add(a, THCNumerics::neg(b)); +inline __host__ __device__ THHalf expm1(THHalf a) { + return THCNumerics::expm1(a); } -inline __host__ __device__ half operator-(half a, int b) { - return THCNumerics::add(a, THCNumerics::neg(ScalarConvert::to(b))); +inline __host__ __device__ THHalf pow(THHalf a, THHalf b) { + return THCNumerics::pow(a, b); } -inline __host__ __device__ float operator-(half a, float b) { - return ScalarConvert::to(a) - b; +inline __host__ __device__ THHalf sqrt(THHalf a) { + return THCNumerics::sqrt(a); } -inline __host__ __device__ double operator-(half a, double b) { - return ScalarConvert::to(a) - b; -} - -inline __host__ __device__ half operator-(int a, half b) { - return THCNumerics::add(ScalarConvert::to(a), THCNumerics::neg(b)); -} - -inline __host__ __device__ float operator-(float a, half b) { - return a - ScalarConvert::to(b); -} - -inline __host__ __device__ double operator-(double a, half b) { - return a - ScalarConvert::to(b); -} - -inline __host__ __device__ half operator*(half a, half b) { - return THCNumerics::mul(a, b); -} - -inline __host__ __device__ float operator*(half a, float b) { - return ScalarConvert::to(a) * b; -} - -inline __host__ __device__ double operator*(half a, double b) { - return ScalarConvert::to(a) * b; -} - -inline __host__ __device__ half operator*(half a, int b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ float operator*(float a, half b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ double 
operator*(double a, half b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ half operator/(half a, half b) { - return THCNumerics::div(a, b); -} - -inline __host__ __device__ float operator/(float a, half b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ double operator/(double a, half b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ half operator/(int a, half b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ float operator/(half a, float b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ double operator/(half a, double b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ half operator/(half a, int b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ half& operator+=(half &lhs, const half &rhs) { - lhs = lhs + rhs; - return lhs; -} -inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) { - lhs = lhs + rhs; - return lhs; -} - -inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) { - lhs = lhs - rhs; - return lhs; -} - -inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) { - lhs = lhs * rhs; - return lhs; -} - -inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) { - lhs = lhs / rhs; - return lhs; -} - -inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) { - lhs = lhs / rhs; - return lhs; -} - -inline __host__ __device__ half abs(half a) { - return THCNumerics::abs(a); -} - -inline __host__ __device__ half exp(half a) { - return THCNumerics::exp(a); -} - -inline __host__ __device__ half log10(half a) { - return THCNumerics::log10(a); -} - -inline __host__ __device__ half log1p(half a) { - return THCNumerics::log1p(a); -} - -inline __host__ __device__ half log2(half a) { - return THCNumerics::log2(a); -} - -inline __host__ __device__ half expm1(half a) { - return THCNumerics::expm1(a); -} - -inline __host__ __device__ half pow(half a, half b) { - return THCNumerics::pow(a, b); -} - -inline __host__ __device__ half sqrt(half a) { - return THCNumerics::sqrt(a); -} - -inline __host__ __device__ half tanh(half a) { - return THCNumerics::tanh(a); -} - -#if defined(_MSC_VER) && CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -inline __host__ __device__ half operator+(half a, int b) { - return THCNumerics::add(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ double operator+(half a, double b) { - return ScalarConvert::to(a) + b; -} - -inline __host__ __device__ half operator*(half a, bool b) { - return THCNumerics::mul(a, ScalarConvert::to(b)); -} -#endif - -// comparison functions - -inline __host__ __device__ bool operator<(half a, half b) { - return THCNumerics::lt(a, b); -} - -inline __host__ __device__ bool operator<=(half a, half b) { - return THCNumerics::le(a, b); -} - -inline __host__ __device__ bool operator<=(half a, int b) { - return THCNumerics::le(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator<(half a, int b) { - return THCNumerics::lt(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator>(half a, half b) { - return THCNumerics::gt(a, b); -} - -inline __host__ __device__ bool operator>(half a, int b) { - return THCNumerics::gt(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator>=(half a, half b) { - return THCNumerics::ge(a, b); -} - -inline __host__ __device__ bool operator>=(half a, int b) { - return THCNumerics::ge(a, ScalarConvert::to(b)); +inline 
__host__ __device__ THHalf tanh(THHalf a) { + return THCNumerics::tanh(a); } #endif diff --git a/aten/src/THCUNN/Tanh.cu b/aten/src/THCUNN/Tanh.cu index a19cc71c045dd2..676da711f33443 100644 --- a/aten/src/THCUNN/Tanh.cu +++ b/aten/src/THCUNN/Tanh.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/TemporalConvolution.cu b/aten/src/THCUNN/TemporalConvolution.cu index af12169d7a4ce8..847b82449ac60f 100644 --- a/aten/src/THCUNN/TemporalConvolution.cu +++ b/aten/src/THCUNN/TemporalConvolution.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/TemporalMaxPooling.cu b/aten/src/THCUNN/TemporalMaxPooling.cu index 2508f835177b25..de478339d8f984 100644 --- a/aten/src/THCUNN/TemporalMaxPooling.cu +++ b/aten/src/THCUNN/TemporalMaxPooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/TemporalReflectionPadding.cu b/aten/src/THCUNN/TemporalReflectionPadding.cu index 4dd4da84c0a2d6..9e905f914653ca 100644 --- a/aten/src/THCUNN/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/TemporalReflectionPadding.cu @@ -8,7 +8,7 @@ #include "THCTensor.hpp" #include "THCStorage.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalReplicationPadding.cu b/aten/src/THCUNN/TemporalReplicationPadding.cu index 2c812bda8d64f5..3f74d1e62473b2 100644 --- a/aten/src/THCUNN/TemporalReplicationPadding.cu +++ b/aten/src/THCUNN/TemporalReplicationPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalRowConvolution.cu b/aten/src/THCUNN/TemporalRowConvolution.cu index 745fef807510d8..097c78fde6f81b 100644 --- a/aten/src/THCUNN/TemporalRowConvolution.cu +++ b/aten/src/THCUNN/TemporalRowConvolution.cu @@ -2,7 +2,7 @@ #include "common.h" #include "row2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu index 89b0c37b1fa78d..2aaf4bcf4435e4 100644 --- a/aten/src/THCUNN/TemporalUpSamplingLinear.cu +++ b/aten/src/THCUNN/TemporalUpSamplingLinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/TemporalUpSamplingNearest.cu index c87129da7e1563..225e319423e8ad 100644 --- a/aten/src/THCUNN/TemporalUpSamplingNearest.cu +++ b/aten/src/THCUNN/TemporalUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/Threshold.cu b/aten/src/THCUNN/Threshold.cu index 37290894103919..1d44e442e21277 100644 --- a/aten/src/THCUNN/Threshold.cu +++ b/aten/src/THCUNN/Threshold.cu @@ -1,5 
+1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu index 84e2c7f7063c3a..89ecfe09bfaf94 100644 --- a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu index 6d542ba39037ee..3e631b2755e9bd 100644 --- a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/VolumetricAveragePooling.cu b/aten/src/THCUNN/VolumetricAveragePooling.cu index 110eac44dcb997..66e89d2a950b72 100644 --- a/aten/src/THCUNN/VolumetricAveragePooling.cu +++ b/aten/src/THCUNN/VolumetricAveragePooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricConvolution.cu b/aten/src/THCUNN/VolumetricConvolution.cu index da66140fb61537..2e405f3c3b00c2 100644 --- a/aten/src/THCUNN/VolumetricConvolution.cu +++ b/aten/src/THCUNN/VolumetricConvolution.cu @@ -1,7 +1,7 @@ #include "THCUNN.h" #include "THCTensor.hpp" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" // Kernel for fast unfold+copy diff --git a/aten/src/THCUNN/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/VolumetricDilatedConvolution.cu index 8a32c70b6701ad..09fc83b01a54f2 100644 --- a/aten/src/THCUNN/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/VolumetricDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "vol2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricDilatedConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu index 1a0f2f617d343e..bff981f73d3a3a 100644 --- a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu index e6260ceabbe282..4875ae9f7da07a 100644 --- a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu @@ -3,7 +3,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricFullConvolution.cu b/aten/src/THCUNN/VolumetricFullConvolution.cu index 556b5bc1d4a5ac..6c4ace126e04e7 100644 --- a/aten/src/THCUNN/VolumetricFullConvolution.cu +++ 
b/aten/src/THCUNN/VolumetricFullConvolution.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricFullConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu index c5c7196bac899e..d315ace7582440 100644 --- a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "vol2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricFullDilatedConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/VolumetricMaxUnpooling.cu index eac3b2d17af5e4..0974ebc763cd8d 100644 --- a/aten/src/THCUNN/VolumetricMaxUnpooling.cu +++ b/aten/src/THCUNN/VolumetricMaxUnpooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricReplicationPadding.cu b/aten/src/THCUNN/VolumetricReplicationPadding.cu index 27ea3ecad3faa2..e9ff31de27240b 100644 --- a/aten/src/THCUNN/VolumetricReplicationPadding.cu +++ b/aten/src/THCUNN/VolumetricReplicationPadding.cu @@ -5,7 +5,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" #include "THCReduceApplyUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu index babbd58b0d4a0f..2f06bdaa78ca0b 100644 --- a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu +++ b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu index 0f353b91acb7ea..ea4c50433370fe 100644 --- a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu +++ b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 67c7856354a31d..6f679dde9e180b 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -32,9 +32,8 @@ if (NOT BUILD_ATEN_MOBILE) # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS}) - # ATen tests use catch instead of gtest so keep separate for now - # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) - # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) + list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) + list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) @@ -86,8 +85,10 @@ add_subdirectory(mobile) add_subdirectory(mpi) add_subdirectory(observers) add_subdirectory(onnx) -add_subdirectory(operators) 
-add_subdirectory(operators/rnn) +if (BUILD_CAFFE2_OPS) + add_subdirectory(operators) + add_subdirectory(operators/rnn) +endif() add_subdirectory(opt) add_subdirectory(perfkernels) add_subdirectory(python) @@ -290,6 +291,11 @@ if (MSVC AND NOT BUILD_SHARED_LIBS) # as the latter is not respected by nvcc target_compile_definitions(caffe2 PUBLIC "AT_CORE_STATIC_WINDOWS=1") endif() +if (MSVC AND BUILD_SHARED_LIBS) + # ONNX is linked statically and needs to be exported from this library + # to be used externally. Make sure that references match the export. + target_compile_options(caffe2 PRIVATE "-DONNX_BUILD_MAIN_LIB") +endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) @@ -426,30 +432,11 @@ if (BUILD_TEST) endforeach() endif() - if (NOT USE_ROCM) - set(__aten_test_dir "test/aten") - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() - - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() + # For special tests that explicitly uses dependencies, we add them here + if (USE_MPI) + target_link_libraries(mpi_test ${MPI_CXX_LIBRARIES}) + if (USE_CUDA) + target_link_libraries(mpi_gpu_test ${MPI_CXX_LIBRARIES}) endif() endif() endif() @@ -510,6 +497,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state PRIVATE $) target_include_directories(caffe2_pybind11_state PRIVATE ${Caffe2_CPU_INCLUDE}) @@ -535,6 +525,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state_gpu PRIVATE $) target_include_directories(caffe2_pybind11_state_gpu PRIVATE ${Caffe2_CPU_INCLUDE}) target_link_libraries( @@ -560,6 +553,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state_hip PRIVATE $) target_include_directories(caffe2_pybind11_state_hip PRIVATE 
${Caffe2_CPU_INCLUDE}) target_link_libraries( diff --git a/caffe2/contrib/aten/CMakeLists.txt b/caffe2/contrib/aten/CMakeLists.txt index 92eb671e019cb7..add3918d4c3373 100644 --- a/caffe2/contrib/aten/CMakeLists.txt +++ b/caffe2/contrib/aten/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT BUILD_ATEN_MOBILE) +if(NOT BUILD_ATEN_MOBILE AND BUILD_CAFFE2_OPS) # Add source generated by Codegen.cmake and pass to parent list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc) list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc) diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index 9f327fdd5d82d4..583a2fa492647b 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -91,14 +91,22 @@ class ATenOp : public Operator { void assignTo(Tensor* dst, const at::Tensor& src_) { at::Tensor src = src_.contiguous(); auto at_sizes = src.sizes(); - std::vector dims(at_sizes.begin(),at_sizes.end()); + caffe2::TypeMeta type_meta = typeMetaFor(src); + at::Device device = src.device(); + at::TensorImpl* src_impl = src.unsafeReleaseTensorImpl(); + std::vector dims(at_sizes.begin(), at_sizes.end()); dst->Resize(dims); dst->ShareExternalPointer( - src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable { - // return a closure that holds a handle to t until it is called - // to keep the aten memory alive - return src.reset(); - }); + at::DataPtr( + src_impl->data(), + static_cast(src_impl), + [](void* t_ptr) -> void { + at::TensorImpl* local_impl = static_cast(t_ptr); + c10::raw::intrusive_ptr::decref(local_impl); + }, + device), + type_meta, + 0); } void assignListStartingAt( size_t offset, @@ -214,10 +222,10 @@ class ATenOp : public Operator { DEFINE_IF(int64, Long) CAFFE_THROW("unsupported type annotation: ", name); } - at::Type & stringToType(const std::string & name) { + at::TypeExtendedInterface & stringToType(const std::string & name) { return at::getNonVariableType(backend(), stringToScalarType(name)); } - at::Type * readTypeAttribute(const std::string & name) { + at::TypeExtendedInterface * readTypeAttribute(const std::string & name) { CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); return &stringToType(OperatorBase::GetSingleArgument(name, "")); } diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 18a3db4c7daed3..70843bb0d91108 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -278,7 +278,7 @@ def find_factory_methods(decls): # first tensor input is used to define the output type. 
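// The aten_op_template.h hunk above replaces the lambda that captured `src` to keep
// the ATen tensor alive with an explicit context+deleter pair: the TensorImpl is
// released, stored as the DataPtr's context, and decref'd when the Caffe2 tensor
// drops the buffer. A generic, self-contained sketch of that ownership-transfer
// pattern (RefCounted, SharedBuffer, decref are illustrative names, not the
// ATen/Caffe2 API):
#include <atomic>
#include <cstdio>

struct RefCounted {
  std::atomic<int> refcount{1};
  float data[4] = {1.f, 2.f, 3.f, 4.f};
};

inline void decref(void* ctx) {                         // deleter: runs when the consumer lets go
  auto* rc = static_cast<RefCounted*>(ctx);
  if (rc->refcount.fetch_sub(1) == 1) delete rc;
}

struct SharedBuffer {                                    // consumer side: raw pointer + context + deleter
  void* data;
  void* ctx;
  void (*deleter)(void*);
  ~SharedBuffer() { if (deleter) deleter(ctx); }
};

int main() {
  auto* producer = new RefCounted();                     // refcount == 1, producer owns the storage
  SharedBuffer buf{producer->data, producer, &decref};   // hand the pointer off; context keeps it alive
  std::printf("%f\n", static_cast<float*>(buf.data)[2]); // consumer reads through the borrowed pointer
  return 0;                                              // ~SharedBuffer decrefs and frees the storage
}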
defined_inferred_type = True env['statements'].append( - 'auto inferred_type = &({}.type());'.format( + 'auto inferred_type = &at::getType({});'.format( arg['name'])) else: init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name']) diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index 716ff7a70814e0..3612d8b46f1f8d 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -95,10 +95,10 @@ void BlobToTensorProto( } // Set values - if (blob->template IsType(CPU)) { + if (blob->IsTensorType(CPU)) { const auto& cpu_tensor = blob->template Get(); CPUTensorToTensorProto(cpu_tensor, t); - } else if (blob->template IsType(CUDA)) { + } else if (blob->IsTensorType(CUDA)) { const auto& cuda_tensor = blob->template Get(); const auto cpu_tensor = TensorCPU(cuda_tensor, context); context->FinishDeviceComputation(); diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h index 1e9be9f50b39d8..96bc720ccd59d1 100644 --- a/caffe2/core/allocator.h +++ b/caffe2/core/allocator.h @@ -11,8 +11,8 @@ CAFFE2_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill); namespace caffe2 { -// Use 32-byte alignment should be enough for computation up to AVX512. -constexpr size_t gCaffe2Alignment = 32; +// Use 64-byte alignment should be enough for computation up to AVX512. +constexpr size_t gCaffe2Alignment = 64; using MemoryDeleter = void (*)(void*); diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index f085ee23995bd7..490b58c1565e6e 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -23,35 +23,22 @@ namespace caffe2 { * properly when the blob is deallocated or re-allocated with a new type. A blob * could contain anything, although the most common case is to contain a Tensor. */ -class CAFFE2_API Blob { +class CAFFE2_API Blob final { public: - typedef void (*DestroyCall)(void*); + using DestroyCall = void(void*); /** * Initializes an empty Blob. */ - Blob() : meta_(), pointer_(nullptr) {} + Blob() noexcept : meta_(), pointer_(nullptr), destroy_(nullptr) {} ~Blob() { Reset(); } - Blob(Blob&& other) noexcept - : meta_(std::move(other.meta_)), - pointer_(std::move(other.pointer_)), - destroy_(std::move(other.destroy_)) { - other.meta_ = {}; - other.pointer_ = nullptr; - other.destroy_ = nullptr; + Blob(Blob&& other) noexcept : Blob() { + swap(other); } Blob& operator=(Blob&& other) noexcept { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = std::move(other.meta_); - pointer_ = std::move(other.pointer_); - destroy_ = std::move(other.destroy_); - other.meta_ = {}; - other.pointer_ = nullptr; - other.destroy_ = nullptr; + Blob(std::move(other)).swap(*this); return *this; } @@ -59,18 +46,12 @@ class CAFFE2_API Blob { * Checks if the content stored in the blob is of type T. */ template - bool IsType() const { + bool IsType() const noexcept { return meta_.Match(); } - // TODO(jerryzh): Remove template - template - bool IsType(DeviceType device_type) const { - static_assert( - std::is_same::value, - "IsType(DeviceType) only available on " - "Tensor types."); - bool is_match = meta_.Match(); + bool IsTensorType(DeviceType device_type) const { + bool is_match = meta_.Match(); auto* tensor = static_cast(pointer_); if (is_match && tensor && tensor->GetDeviceType() == device_type) { return true; @@ -81,12 +62,12 @@ class CAFFE2_API Blob { /** * Returns the meta info of the blob. 
*/ - inline const TypeMeta& meta() const { return meta_; } + inline const TypeMeta& meta() const noexcept { return meta_; } /** * Returns a printable typename of the blob. */ - inline const char* TypeName() const { return meta_.name(); } + inline const char* TypeName() const noexcept { return meta_.name(); } /** * @brief Gets the const reference of the stored object. The code checks if @@ -107,10 +88,10 @@ class CAFFE2_API Blob { return *static_cast(pointer_); } - const void* GetRaw() const { + const void* GetRaw() const noexcept { return pointer_; } - void* GetRaw() { + void* GetRaw() noexcept { return pointer_; } @@ -149,7 +130,7 @@ class CAFFE2_API Blob { } inline Tensor* GetMutableTensor(DeviceType device_type) { - if (IsType(device_type)) { + if (IsTensorType(device_type)) { return static_cast(pointer_); } else { VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() @@ -178,7 +159,7 @@ class CAFFE2_API Blob { } inline void* - Reset(void* allocated, const TypeMeta& meta, const DestroyCall& destroy) { + Reset(void* allocated, const TypeMeta& meta, DestroyCall* destroy) { if (pointer_ && destroy_) { destroy_(pointer_); } @@ -192,8 +173,8 @@ class CAFFE2_API Blob { * Releases the ownership, if any, this Blob has on the underlying pointer. * The user is then responsible for freeing the data if needed */ - inline DestroyCall Release() { - DestroyCall d = destroy_; + inline DestroyCall* Release() { + DestroyCall* d = destroy_; destroy_ = nullptr; return d; } @@ -289,7 +270,7 @@ class CAFFE2_API Blob { } TypeMeta meta_; void* pointer_ = nullptr; - DestroyCall destroy_ = nullptr; + DestroyCall* destroy_ = nullptr; AT_DISABLE_COPY_AND_ASSIGN(Blob); }; diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 0f0fcb54906dcd..29bf0c3bc52ae6 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -148,7 +148,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { } \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CUDA)); \ + EXPECT_TRUE(new_blob.IsTensorType(CUDA)); \ Tensor new_cpu_tensor(blob.Get(), CPU); \ EXPECT_EQ(new_cpu_tensor.ndim(), 2); \ EXPECT_EQ(new_cpu_tensor.dim(0), 2); \ @@ -198,7 +198,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if the restored blob is still of the same device. 
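// The blob.h hunk above rewrites Blob's move operations with the move-and-swap idiom
// (the move constructor delegates to the default constructor and swaps; move
// assignment constructs a temporary from the source and swaps it with *this), and
// replaces the templated IsType<Tensor>(device) check with a plain
// IsTensorType(device), which the test hunks here switch to. A minimal
// self-contained sketch of the move-and-swap idiom (Holder is illustrative, not
// caffe2::Blob):
#include <utility>

class Holder {
 public:
  Holder() noexcept = default;
  explicit Holder(int* p) noexcept : ptr_(p) {}
  ~Holder() { delete ptr_; }

  Holder(const Holder&) = delete;
  Holder& operator=(const Holder&) = delete;

  Holder(Holder&& other) noexcept : Holder() { swap(other); }  // steal by swapping with an empty object
  Holder& operator=(Holder&& other) noexcept {
    Holder(std::move(other)).swap(*this);                      // temporary walks off with the old state and frees it
    return *this;
  }

  void swap(Holder& other) noexcept { std::swap(ptr_, other.ptr_); }

 private:
  int* ptr_ = nullptr;
};
// Only one code path (the destructor) releases resources, so the move operations
// cannot leak or double-free, and self-move-assignment degrades to a harmless no-op.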
blob.Reset(); EXPECT_NO_THROW(blob.Deserialize(serialized)); - EXPECT_TRUE(blob.IsType(CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), gpu_id); // Test if we force the restored blob on a different device, we @@ -206,7 +206,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { blob.Reset(); proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(blob.Deserialize(proto.SerializeAsString())); - EXPECT_TRUE(blob.IsType(CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); } } diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 97d17c6e5924f3..8103071c81ee26 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -86,15 +86,15 @@ TEST(BlobTest, Blob) { int* int_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsType(CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsType(CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU); - EXPECT_TRUE(blob.IsType(CPU)); + EXPECT_TRUE(blob.IsTensorType(CPU)); EXPECT_FALSE(blob.IsType()); EXPECT_FALSE(blob.IsType()); } @@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { } \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 2); \ @@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { EXPECT_EQ(tensor_proto.field_name##_size(), 0); \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 0); \ @@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { EXPECT_EQ(proto.type(), "Tensor"); Blob new_blob; EXPECT_NO_THROW(new_blob.Deserialize(serialized)); - EXPECT_TRUE(new_blob.IsType(CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 2); EXPECT_EQ(new_tensor.dim(0), 2); @@ -724,7 +724,7 @@ TEST(TensorTest, float16) { } Blob new_blob; EXPECT_NO_THROW(new_blob.Deserialize(serialized)); - EXPECT_TRUE(new_blob.IsType(CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 1); EXPECT_EQ(new_tensor.dim(0), kSize); @@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { load_op->Run(); VLOG(1) << "Reading blob from workspace"; auto new_blob = ws.GetBlob("test"); - EXPECT_TRUE(new_blob->IsType(CPU)); + EXPECT_TRUE(new_blob->IsTensorType(CPU)); const auto& new_tensor = new_blob->Get(); EXPECT_EQ(new_tensor.ndim(), d1); diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h index a8f2808e30eeab..2bbbce7df1e4ef 100644 --- a/caffe2/core/common_cudnn.h +++ b/caffe2/core/common_cudnn.h @@ -4,8 +4,6 @@ #include #include -#include - #include "caffe2/core/common.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" @@ -16,6 +14,8 @@ #error("This Caffe2 install is not built with cudnn, so you should not include this file."); #endif 
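// The common_cudnn.h hunk around this point moves an #include (its target is elided
// in this rendering) from above the "not built with cudnn" guard to below it, so a
// build without the optional dependency stops at the explicit #error instead of
// failing on a missing header. The general shape, with illustrative names:
#ifndef MYLIB_WITH_FOO
#error "This build was configured without foo; do not include this header."
#endif

#include "foo/foo_api.h"   // only compiled when the feature is actually available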
+#include + static_assert( CUDNN_VERSION >= 5000, "Caffe2 requires cudnn version 5.0 or above."); diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index 4658cd0d756099..b73a6aefa406a0 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -116,7 +116,7 @@ inline int CudaVersion() { return CUDA_VERSION; } /** * Returns the number of devices. */ -int NumCudaDevices(); +CAFFE2_CUDA_API int NumCudaDevices(); /** * Check if the current running session has a cuda gpu present. diff --git a/caffe2/core/context.h b/caffe2/core/context.h index 4faaea93c6da12..aff66534d22198 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -153,7 +153,7 @@ class CAFFE2_API CPUContext final : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return CPU; } @@ -207,6 +207,7 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) override { + CHECK(device); device->set_device_type(TypeToProto(GetDeviceType())); } diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 3090ca57aedc31..987c9ffe35299d 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -285,7 +285,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { return cudaStreamQuery(stream) == cudaSuccess; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return CUDA; } @@ -403,6 +403,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { } void ExtractDeviceOption(DeviceOption* device, const void* data) override { + CAFFE_ENFORCE(data, "data cannot be nullptr"); device->set_device_type(TypeToProto(GetDeviceType())); device->set_cuda_gpu_id(GetGPUIDForPointer(data)); } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index cd309e6473be4f..5a7613cf934fd0 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -269,7 +269,7 @@ class HIPContext final : public BaseContext { return hipStreamQuery(stream) == hipSuccess; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return HIP; } diff --git a/caffe2/core/nomnigraph/Representations/NeuralNet.cc b/caffe2/core/nomnigraph/Representations/NeuralNet.cc index 2765b48435e7bd..28f33a43cbff2f 100644 --- a/caffe2/core/nomnigraph/Representations/NeuralNet.cc +++ b/caffe2/core/nomnigraph/Representations/NeuralNet.cc @@ -216,17 +216,14 @@ NNNodeMatchCriteria criteriaSingleConsumer() { "Single consumer"); } -NNNodeMatchCriteria matchTensor() { - return matchOp("matchTensor"); +NNNodeMatchCriteria matchTensor(const std::string& debugString) { + return matchOp(debugString); } -NNMatchGraph::NodeRef operatorSubgraph( - NNMatchGraph& g, - const NNNodeMatchCriteria& root, - const std::vector& childrenCriteria, - int count) { - return subgraph( - g, matchTensor(), {subgraph(g, root, childrenCriteria)}, count); +NNMatchNode matchExternalTensorNode(const std::string& debugString) { + return NNMatchNode(matchTensor(debugString)) + .nonTerminal() + .excludeFromSubgraph(); } } // namespace nn diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index c6b10f0912eca8..568d46a61ff561 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -39,7 +39,7 @@ class Node; // \brief Edge within a 
Graph. template -class CAFFE2_API Edge : public StorageType { +class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) @@ -73,7 +73,7 @@ class CAFFE2_API Edge : public StorageType { // \brief Node within a Graph. template -class CAFFE2_API Node : public StorageType, public Notifier> { +class Node : public StorageType, public Notifier> { public: using NodeRef = typename Graph::NodeRef; using EdgeRef = typename Graph::EdgeRef; @@ -152,7 +152,7 @@ class CAFFE2_API Node : public StorageType, public Notifier> { /// for example. /// template -class CAFFE2_API Subgraph { +class Subgraph { public: Subgraph() { DEBUG_PRINT("Creating instance of Subgraph: %p\n", this); @@ -219,7 +219,7 @@ class CAFFE2_API Subgraph { /// Everything is owned by the graph to simplify storage concerns. /// template -class CAFFE2_API Graph { +class Graph { public: using SubgraphType = Subgraph; using NodeRef = Node*; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index f4c3940acd0711..2a03e428619b30 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -53,9 +53,6 @@ class CAFFE2_API Annotation { return kind_; } - Annotation(const Annotation&) = delete; - Annotation& operator=(Annotation&) = delete; - private: const AnnotationKind kind_; }; @@ -427,7 +424,7 @@ CAFFE2_API void coalesceInsertedDataDependencies(repr::NNModule* m); template struct CAFFE2_EXPORT NodeHelper {}; -struct CAFFE2_API NNNodeMatchCriteria { +struct NNNodeMatchCriteria { std::function predicate; std::string debugString; @@ -474,8 +471,6 @@ NNNodeMatchCriteria matchOp(const std::string& debugString = "matchOp") { debugString); } -CAFFE2_API NNNodeMatchCriteria matchTensor(); - template NNNodeMatchCriteria matchOp( const std::function predicate, @@ -489,6 +484,12 @@ NNNodeMatchCriteria matchOp( debugString); }; +CAFFE2_API NNNodeMatchCriteria +matchTensor(const std::string& debugString = "matchTensor"); + +CAFFE2_API NNMatchNode +matchExternalTensorNode(const std::string& debugString = "matchExternalTensor"); + struct CAFFE2_API NNNodeMatch { static bool isMatch( const NNGraph::NodeRef& node, @@ -500,15 +501,6 @@ struct CAFFE2_API NNNodeMatch { using NNSubgraphMatcher = nom::matcher::SubgraphMatcher; -// This helper method makes it easy to create matching criteria in NNGraph. -// For example, operatorSubgraph(opMatch, ...) will refer to a tree like this: -// ... 
-> opMatch -> opMatch_Output -CAFFE2_API NNMatchGraph::NodeRef operatorSubgraph( - NNMatchGraph& g, - const NNNodeMatchCriteria& root, - const std::vector& childrenCriteria = {}, - int count = 1); - } // namespace nn } // namespace repr diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index f11975284b217c..a303324fbb5701 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -29,18 +29,11 @@ namespace matcher { */ template -class CAFFE2_API MatchNode { +class MatchNode { public: static const int kStarCount = -1; - MatchNode( - const NodeMatchCriteria& criteria, - bool includeInSubgraph = true, - int count = 1, - bool nonTerminal = false) - : criteria_(criteria), - includeInSubgraph_(includeInSubgraph), - count_(count), - nonTerminal_(nonTerminal) {} + + MatchNode(const NodeMatchCriteria& criteria) : criteria_(criteria) {} MatchNode() = default; MatchNode(const MatchNode&) = default; @@ -55,6 +48,25 @@ class CAFFE2_API MatchNode { return count_; } + MatchNode& count(int count) { + count_ = count; + return *this; + } + + MatchNode& starCount() { + return count(kStarCount); + } + + MatchNode& nonTerminal() { + nonTerminal_ = true; + return *this; + } + + MatchNode& excludeFromSubgraph() { + includeInSubgraph_ = false; + return *this; + } + bool isNonTerminal() const { return nonTerminal_; } @@ -65,9 +77,9 @@ class CAFFE2_API MatchNode { private: NodeMatchCriteria criteria_; - bool includeInSubgraph_; - int count_; - bool nonTerminal_; + int count_ = 1; + bool includeInSubgraph_ = true; + bool nonTerminal_ = false; }; template @@ -76,38 +88,12 @@ using MatchGraph = Graph>; template using MatchNodeRef = typename MatchGraph::NodeRef; -template -MatchNodeRef subgraph( - MatchGraph& graph, - const NodeMatchCriteria& root, - const std::vector>& children, - int count = 1, - bool includeInSubgraph = true) { - auto result = graph.createNode( - MatchNode(root, includeInSubgraph, count, false)); - for (auto child : children) { - graph.createEdge(result, child); - } - return result; -} - -// Note that for nonTerminalSubgraph, the default value for includeInSubgraph -// is false since we typically do not want to include a nonTerminal node -// in the matched subgraph. -template -MatchNodeRef nonTerminalSubgraph( - MatchGraph& graph, - const NodeMatchCriteria& root, - int count = 1, - bool includeInSubgraph = false) { - return graph.createNode( - MatchNode(root, includeInSubgraph, count, true)); -} - // TODO: Reuse convertToDotString once convertToDotString can work // with subgraph. template -std::string debugString(MatchNodeRef rootCriteriaRef) { +std::string debugString( + MatchNodeRef rootCriteriaRef, + bool invertGraphTraversal) { std::ostringstream out; auto rootNode = rootCriteriaRef->data(); out << "{rootCriteria = '" << rootNode.getCriteria() << "'"; @@ -117,11 +103,14 @@ std::string debugString(MatchNodeRef rootCriteriaRef) { if (rootNode.isNonTerminal()) { out << ", nonTerminal = " << rootNode.isNonTerminal(); } - auto outEdges = rootCriteriaRef->getOutEdges(); - if (!outEdges.empty()) { + auto edges = invertGraphTraversal ? 
rootCriteriaRef->getInEdges() + : rootCriteriaRef->getOutEdges(); + if (!edges.empty()) { out << ", childrenCriteria = ["; - for (auto& child : outEdges) { - out << debugString(child->head()) << ", "; + for (auto& child : edges) { + auto nextNode = invertGraphTraversal ? child->tail() : child->head(); + out << debugString(nextNode, invertGraphTraversal) + << ", "; } out << "]"; } @@ -294,7 +283,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Subgraph root at " << root << " is not the same as " << matchedNode << " which previously matched criteria " - << debugString(rootCriteriaRef); + << debugString( + rootCriteriaRef, invertGraphTraversal); return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { return SubgraphMatchResultType::notMatched(); @@ -307,7 +297,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Subgraph root at " << root << " does not match criteria " - << debugString(rootCriteriaRef); + << debugString( + rootCriteriaRef, invertGraphTraversal); return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { return SubgraphMatchResultType::notMatched(); @@ -326,8 +317,10 @@ struct SubgraphMatcher { invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); int numEdges = edges.size(); - const auto outEdges = rootCriteriaRef->getOutEdges(); - int numChildrenCriteria = outEdges.size(); + const auto criteriaEdges = invertGraphTraversal + ? rootCriteriaRef->getInEdges() + : rootCriteriaRef->getOutEdges(); + int numChildrenCriteria = criteriaEdges.size(); // The current algorithm implies that the ordering of the children is // important. The children nodes will be matched with the children subgraph @@ -336,7 +329,9 @@ struct SubgraphMatcher { int currentEdgeIdx = 0; for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; criteriaIdx++) { - auto childrenCriteriaRef = outEdges[criteriaIdx]->head(); + auto childrenCriteriaRef = invertGraphTraversal + ? criteriaEdges[criteriaIdx]->tail() + : criteriaEdges[criteriaIdx]->head(); int expectedCount = childrenCriteriaRef->data().getCount(); bool isStarCount = @@ -374,7 +369,7 @@ struct SubgraphMatcher { debugMessage << "Child node at " << child << " does not match child criteria " << debugString( - childrenCriteriaRef) + childrenCriteriaRef, invertGraphTraversal) << ". 
We expected " << expectedCount << " matches but only found " << countMatch << "."; return SubgraphMatchResultType::notMatched(debugMessage.str()); @@ -399,7 +394,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Expected " << expectedCount << " matches for child criteria " - << debugString(childrenCriteriaRef) + << debugString( + childrenCriteriaRef, invertGraphTraversal) << " but only found " << countMatch; return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { diff --git a/caffe2/core/nomnigraph/tests/NeuralNetTest.cc b/caffe2/core/nomnigraph/tests/NeuralNetTest.cc index 32c0f20f3571bf..874da120b5be8f 100644 --- a/caffe2/core/nomnigraph/tests/NeuralNetTest.cc +++ b/caffe2/core/nomnigraph/tests/NeuralNetTest.cc @@ -43,27 +43,28 @@ TEST(NeuralNetGraph, ReplaceGraph) { */ auto mg = NNMatchGraph(); - // clang-format off - auto matchSumOutput = operatorSubgraph(mg, - matchOp(), { - nonTerminalSubgraph(mg, matchTensor(), 2) - });; - auto pattern = subgraph(mg, - matchOp(), { - matchSumOutput - }); - // clang-format on + auto matchSumInput = + mg.createNode(std::move(matchExternalTensorNode().count(2))); + auto matchSum = mg.createNode(matchOp("matchSum")); + mg.createEdge(matchSumInput, matchSum); + + auto matchSumOutput = mg.createNode(matchTensor("matchSumOutput")); + mg.createEdge(matchSum, matchSumOutput); + + auto matchRelu = mg.createNode(matchOp("matchRelu")); + mg.createEdge(matchSumOutput, matchRelu); - EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, pattern).isMatch()); + auto matchRoot = matchRelu; + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, matchRoot).isMatch()); EXPECT_FALSE( - NNSubgraphMatcher::isSubgraphMatch(reluOutput, pattern).isMatch()); - EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, pattern).isMatch()); + NNSubgraphMatcher::isSubgraphMatch(reluOutput, matchRoot).isMatch()); + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, matchRoot).isMatch()); - EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, pattern).isMatch()); + EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, matchRoot).isMatch()); NNSubgraphMatcher::replaceSubgraph( graph, - pattern, + matchRoot, [&matchSumOutput]( NNGraph& g, NNGraph::NodeRef relu, diff --git a/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc b/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc index 7ed996aa5f1bdc..ee677665c6546d 100644 --- a/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc +++ b/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc @@ -41,27 +41,22 @@ TestMatchGraph::NodeRef Tree( const Criteria& root, const std::vector& children = {}, int count = 1) { - return subgraph(graph, root, children, count); + auto result = graph.createNode(std::move(TestMatchNode(root).count(count))); + for (auto& child : children) { + graph.createEdge(result, child); + } + return result; } TestMatchGraph::NodeRef NonTerminal(const Criteria& root, int count = 1) { - return nonTerminalSubgraph(graph, root, count); + return graph.createNode( + std::move(TestMatchNode(root).count(count).nonTerminal())); } Criteria any() { return Criteria("*"); } -// Make it more concise to create matching criteria in dataflow graph. -// For example, operatorTree("opA", ...) will refer to a tree like this: -// ... 
-> opA -> opA_Output -TestMatchGraph::NodeRef operatorTree( - const Criteria& root, - const std::vector& childrenCriteria = {}, - int count = 1) { - return Tree(any(), {Tree(root, childrenCriteria)}, count); -} - std::map TestGraphNodePrinter( TestGraph::NodeRef node) { std::map labelMap; @@ -185,20 +180,35 @@ struct DataFlowTestGraphCriteria { TestMatchGraph::NodeRef matchOpG; DataFlowTestGraphCriteria() { - // clang-format off - matchOpCOutput = operatorTree("opC", { - NonTerminal(Criteria("input"), TestMatchNode::kStarCount) - }); - matchOpG = Tree( - Criteria("opG"),{ - operatorTree("opF", { - operatorTree("opB", { - matchOpCOutput, matchOpCOutput, - }) - }), - NonTerminal(any()) // matches dataI - }); - // clang-format on + auto matchOpCInputs = + graph.createNode(std::move(TestMatchNode(Criteria("input")) + .starCount() + .nonTerminal() + .excludeFromSubgraph())); + auto matchOpC = graph.createNode(Criteria("opC")); + graph.createEdge(matchOpCInputs, matchOpC); + + matchOpCOutput = graph.createNode(any()); + graph.createEdge(matchOpC, matchOpCOutput); + + auto matchOpB = graph.createNode(Criteria("opB")); + graph.createEdge(matchOpCOutput, matchOpB); + graph.createEdge(matchOpCOutput, matchOpB); + + auto matchOpBOutput = graph.createNode(any()); + graph.createEdge(matchOpB, matchOpBOutput); + + auto matchOpF = graph.createNode(Criteria("opF")); + graph.createEdge(matchOpBOutput, matchOpF); + + auto matchOpFOutput = graph.createNode(any()); + graph.createEdge(matchOpF, matchOpFOutput); + + matchOpG = graph.createNode(Criteria("opG")); + auto matchDataI = graph.createNode( + std::move(TestMatchNode(any()).nonTerminal().excludeFromSubgraph())); + graph.createEdge(matchOpFOutput, matchOpG); + graph.createEdge(matchDataI, matchOpG); } }; diff --git a/caffe2/core/observer_test.cc b/caffe2/core/observer_test.cc index f4f4e81a3cb45c..fa8aee6d818366 100644 --- a/caffe2/core/observer_test.cc +++ b/caffe2/core/observer_test.cc @@ -151,6 +151,9 @@ TEST(ObserverTest, TestDAGNetBase) { EXPECT_EQ(1212, count_after - count_before); } +#if 0 +// This test intermittently segfaults, +// see https://github.com/pytorch/pytorch/issues/9137 TEST(ObserverTest, TestMultipleNetBase) { Workspace ws; ws.CreateBlob("in"); @@ -176,4 +179,5 @@ TEST(ObserverTest, TestMultipleNetBase) { EXPECT_EQ(net.get()->NumObservers(), prev_num); } +#endif } // namespace caffe2 diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 2def93f0b51d08..e75681ff3a9df6 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -143,36 +143,26 @@ class CAFFE2_API OperatorBase : public Observable { inline bool InputIsType(int idx) { static_assert( !std::is_same::value, - "You should use InputIsType(int, DeviceType) for " + "You should use InputIsTensorType(int, DeviceType) for " "Tensor."); return inputs_.at(idx)->template IsType(); } - template - inline bool InputIsType(int idx, DeviceType device_type) { - static_assert( - std::is_same::value, - "InputIsType(idx, DeviceType) only available on " - "Tensor types."); - return inputs_.at(idx)->template IsType(device_type); + inline bool InputIsTensorType(int idx, DeviceType device_type) { + return inputs_.at(idx)->IsTensorType(device_type); } template inline bool OutputIsType(int idx) { static_assert( !std::is_same::value, - "You should use OutputIsType(int, DeviceType) for " + "You should use OutputIsTensorType(int, DeviceType) for " "Tensor."); return outputs_.at(idx)->template IsType(); } - template - inline bool OutputIsType(int idx, DeviceType type) { - 
static_assert( - std::is_same::value, - "OutputIsType(idx, DeviceType) only available on " - "Tensor types."); - return outputs_.at(idx)->template IsType(type); + inline bool OutputIsTensorType(int idx, DeviceType type) { + return outputs_.at(idx)->IsTensorType(type); } inline int InputSize() const { diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index c7f3a7af539d51..7db975077ea8b9 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -17,21 +17,13 @@ #include #include +#include + #include "caffe2/core/common.h" #include "caffe2/core/typeid.h" namespace caffe2 { -template -inline void PrintOffendingKey(const KeyType& /*key*/) { - printf("[key type printing not supported]\n"); -} - -template <> -inline void PrintOffendingKey(const string& key) { - printf("Offending key: %s.\n", key.c_str()); -} - /** * @brief A template class that allows one to register classes by keys. * @@ -43,7 +35,7 @@ inline void PrintOffendingKey(const string& key) { * objects. */ template -class CAFFE2_API Registry { +class Registry { public: typedef std::function Creator; @@ -59,7 +51,7 @@ class CAFFE2_API Registry { std::lock_guard lock(register_mutex_); if (registry_.count(key) != 0) { printf("Key already registered.\n"); - PrintOffendingKey(key); + at::PrintOffendingKey(key); std::exit(1); } registry_[key] = creator; @@ -112,7 +104,7 @@ class CAFFE2_API Registry { }; template -class CAFFE2_API Registerer { +class Registerer { public: Registerer( const SrcType& key, diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index 973b07ee630642..35647d7b62d8af 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -16,271 +16,17 @@ #include "caffe2/core/logging.h" #include "caffe2/core/typeid.h" +#include #include #include #include +#include +#include namespace caffe2 { -using DataType = TypeMeta; -using DataPtr = std::shared_ptr; -using at::DeviceType; - -class CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { - public: - StorageImpl() = delete; - StorageImpl(const StorageImpl&) = delete; - StorageImpl& operator=(const StorageImpl&) = delete; - - explicit StorageImpl(DeviceType device_type) : device_type_(device_type) {} - StorageImpl(DeviceType device_type, TypeMeta data_type) - : data_type_(data_type), device_type_(device_type) {} - template - StorageImpl( - DeviceType device_type, - TypeMeta data_type, - void* src, - size_t capacity, - Deleter d = nullptr) - : data_type_(data_type), device_type_(device_type) { - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "To create storage with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - // Check if the deleter is a MemoryDeleter and is a simple nullptr. - if (std::is_same::value && - reinterpret_cast(static_cast(&d))[0] == - nullptr) { - // Use aliasing constructor trick to avoid calling the destructor. 
- data_ptr_ = std::shared_ptr(std::shared_ptr(), src); - } else { - data_ptr_.reset(src, d); - } - capacity_ = capacity; - } - - void reset() { - data_ptr_.reset(); - capacity_ = 0; - } - - template - inline bool IsType() const { - return data_type_.Match(); - } - - void* data() const { - return data_ptr_.get(); - } - - void* data() { - return data_ptr_.get(); - } - - DataPtr& data_ptr() { - return data_ptr_; - } - - const DataPtr& data_ptr() const { - return data_ptr_; - } - - void set_dtype(const DataType& data_type) { - data_type_ = data_type; - } - - const DataType& dtype() const { - return data_type_; - } - - size_t capacity() const { - return capacity_; - } - - int64_t numel() const { - return capacity_ / itemsize(); - } - - // TODO: remove later - void set_numel(int64_t numel) { - capacity_ = numel * itemsize(); - } - - inline DeviceType device_type() const { - return device_type_; - } - - inline size_t itemsize() const { - return data_type_.itemsize(); - } - - // Rule of Five - StorageImpl(StorageImpl&&) = default; - ~StorageImpl() = default; - StorageImpl& operator=(StorageImpl&&) = default; - - /** - * Can only be called when use_count is 1 - */ - template - void UniqueStorageShareExternalPointer( - void* src, - const DataType& data_type, - size_t capacity, - Deleter d = nullptr) { - data_type_ = data_type; - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to have meta " - "already set."); - // Check if the deleter is a MemoryDeleter and is a simple nullptr. - if (std::is_same::value && - reinterpret_cast(&d)[0] == nullptr) { - // Use aliasing constructor trick to avoid calling the destructor. - data_ptr_ = std::shared_ptr(std::shared_ptr(), src); - } else { - data_ptr_.reset(src, d); - } - capacity_ = capacity; - } - - private: - int64_t capacity_ = 0; - DataType data_type_; - DataPtr data_ptr_; - // allocator_ takes precedence over StaticContext from device_type_ - // Allocator* allocator_; - DeviceType device_type_ = CPU; -}; - -class CAFFE2_API Storage { - public: - Storage() {} - Storage(DeviceType device_type) - : storage_impl_(c10::make_intrusive(device_type)) {} - Storage(DeviceType device_type, TypeMeta data_type) - : storage_impl_( - c10::make_intrusive(device_type, data_type)) {} - - template - Storage( - T* src, - DeviceType device_type, - size_t capacity = 0, - Deleter d = nullptr) - : storage_impl_(c10::make_intrusive( - src, - device_type, - TypeMeta::Make(), - capacity, - d)) {} - - template - Storage( - void* src, - DeviceType device_type, - TypeMeta data_type, - size_t capacity, - Deleter d = nullptr) - : storage_impl_(c10::make_intrusive( - device_type, - data_type, - src, - capacity, - d)) {} - - void reset() { - storage_impl_->reset(); - } - - template - inline bool IsType() const { - return storage_impl_->IsType(); - } - - void* data() const { - return storage_impl_->data(); - } - - void* data() { - return storage_impl_->data(); - } - - DataPtr& data_ptr() { - return storage_impl_->data_ptr(); - } - - const DataPtr& data_ptr() const { - return storage_impl_->data_ptr(); - } - - void set_dtype(const DataType& data_type) { - storage_impl_->set_dtype(data_type); - } - - const DataType& dtype() const { - return storage_impl_->dtype(); - } - size_t capacity() const { - return storage_impl_->capacity(); - } - - int64_t numel() const { - return storage_impl_->numel(); - } - - // TODO: remove later - void set_numel(int64_t numel) { - storage_impl_->set_numel(numel); - } - - DeviceType 
device_type() const { - return storage_impl_->device_type(); - } - - inline size_t itemsize() const { - return storage_impl_->itemsize(); - } - - inline long use_count() const { - return storage_impl_.use_count(); - } - - inline bool unique() const { - return storage_impl_.unique(); - } - - template - void UniqueStorageShareExternalPointer( - void* src, - const DataType& data_type, - size_t capacity, - Deleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - storage_impl_.unique(), - "UniqueStorageShareExternalPointer can only be called when \ - use_count == 1"); - storage_impl_->UniqueStorageShareExternalPointer( - src, data_type, capacity, d); - } - - protected: - c10::intrusive_ptr storage_impl_; -}; - -/** - * Create a Storage given an external pointer `src`. - * `device_type`: the device type of the storage - * `capacity`: the capacity of the Tensor - */ -template -Storage CreateStorage( - T* src, - DeviceType device_type, - size_t capacity = 0, - Deleter d = nullptr) { - return CreateStorage(src, device_type, TypeMeta::Make(), capacity, d); -} +using StorageImpl = at::StorageImpl; +using Storage = at::Storage; } // namespace caffe2 diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 1659e6ba252bab..58b4c4b75e91cb 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -1,18 +1,6 @@ #include "caffe2/core/tensor.h" #include "caffe2/core/blob_stats.h" -#include "caffe2/core/flags.h" - -CAFFE2_DEFINE_bool( - caffe2_keep_on_shrink, - true, - "If set, keeps memory when a tensor is shrinking its size."); - -CAFFE2_DEFINE_int64( - caffe2_max_keep_on_shrink_memory, - LLONG_MAX, - "The maximum memory in bytes to keep on shrink, if the difference between " - "tensor sizes is bigger than this then tensor will be reset."); namespace caffe2 { @@ -93,7 +81,11 @@ vector GetTensorInfo( const void* c, size_t* capacity, DeviceOption* device) { + CHECK(capacity); const Tensor* tc = static_cast(c); + CHECK(tc); + CHECK(tc->unsafeGetTensorImpl()); + CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()); *capacity = tc->capacity_nbytes(); tc->ExtractDeviceOption(device); return tc->dims(); diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 392cb523c21b44..27c09f00c4c1e2 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -2,873 +2,12 @@ #define CAFFE2_CORE_TENSOR_H_ #include "caffe2/core/storage.h" +#include "caffe2/core/tensor_impl.h" #include -// A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to -// keep the memory allocated for its maximum capacity reshaped to so far. -CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); - -// Since we can have high variance in blob memory allocated across different -// inputs in the same run, we will shrink the blob only if the memory gain -// is larger than this flag in bytes. -CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); - namespace caffe2 { -/** - * A utility function to convert vector to vector. 
- */ -inline vector ToVectorTIndex(const std::vector& src) { - return vector(src.begin(), src.end()); -} - -/** - * Return product of all dimensions starting from k - */ -inline TIndex size_from_dim_(int k, const vector& dims) { - TIndex r = 1; - for (size_t i = k; i < dims.size(); ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims up to k (not including dims[k]) -inline TIndex size_to_dim_(int k, const vector& dims) { - CAFFE_ENFORCE((unsigned)k <= dims.size()); - TIndex r = 1; - for (int i = 0; i < k; ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims between k and l (not including dims[k] and dims[l]) -inline TIndex size_between_dim_(int k, int l, const vector& dims) { - CAFFE_ENFORCE((unsigned)l < dims.size()); - TIndex r = 1; - if (k < l) { - for (int i = k + 1; i < l; ++i) { - r *= dims[i]; - } - } else { - for (int i = l + 1; i < k; ++i) { - r *= dims[i]; - } - } - return r; -} - -// Wrap around axis_index if it is negative, s.t., -1 is the last dim -inline int canonical_axis_index_(int axis_index, int ndims) { - CAFFE_ENFORCE_GE(axis_index, -ndims); - CAFFE_ENFORCE_LT(axis_index, ndims); - if (axis_index < 0) { - return axis_index + ndims; - } - return axis_index; -} - -/** - * @brief TensorImpl is the implementation of a tensor and the basic class - * in Caffe2 that stores a contiguous memory with its shape information. - * - * The TensorImpl class is essentially a wrapper around a device-specific memory - * (the device is specified by the Context template argument), and deals with - * the allocation and de-allocation of such memory. We make a simplified - * assumption that the memory is always contiguous. - */ -class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { - public: - TensorImpl() = delete; - explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? - explicit TensorImpl(const vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - explicit TensorImpl(const vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - /* Now we require that context_for_copy has the same device type as src since - * template is removed - */ - TensorImpl( - const TensorImpl& src, - BaseContext* context_for_copy, - at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src, context_for_copy); - } - - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ - TensorImpl(const TensorImpl& src, at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src); - } - - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ - template - TensorImpl( - const vector& dims, - const vector& values, - BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, values.data(), mutable_data()); - } - - /** - * @brief Creates a scalar tensor, and fills its content with the given value. 
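[Editor's note] The two value-initializing constructors documented here survive the TensorImpl/Tensor split and reappear on the Tensor wrapper later in this diff. A hedged usage sketch, with an invented function name and a default-constructed CPUContext standing in for whatever context the caller already has:

    #include "caffe2/core/context.h"
    #include "caffe2/core/tensor.h"

    namespace caffe2 {

    void ValueConstructorSketch() {
      CPUContext ctx;
      // Dims plus values: T is deduced from the value vector (float here).
      Tensor filled(std::vector<TIndex>{3}, std::vector<float>{1.f, 2.f, 3.f}, &ctx);
      // Scalar form: a zero-dim tensor holding a single value.
      Tensor scalar(4.f, &ctx);
      CAFFE_ENFORCE_EQ(filled.size(), 3);
      CAFFE_ENFORCE_EQ(scalar.size(), 1);
    }

    } // namespace caffe2
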
- * The type of tensor will be decided by the context parameter - */ - template < - typename T, - typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(vector{}); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, &value, mutable_data()); - } - - /** - * @brief Delete the copy constructor and use Clone explicitly - */ - TensorImpl(const TensorImpl& src) = delete; - - TensorImpl(TensorImpl&& src) noexcept { - swap(src); - } - - TensorImpl& operator=(TensorImpl&&) = default; - // Note(jiayq): possibly a rule-of-three violation, but we explicitly - // discourage the use of = for Tensors. - TensorImpl& operator=(const TensorImpl& src) = delete; - - virtual ~TensorImpl() noexcept {} - - /* - * Since we removed template from tensor, we now store a static - * context pointer in tensor, which indicates the type of the tensor. - */ - BaseStaticContext* GetStaticContext() const { - return get_static_context(GetDeviceType()); - } - - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - - at::DeviceType GetDeviceType() const { - return storage_.device_type(); - } - - /** - * @brief Copies the data from a source tensor, with a contex provided to - * carry out the underlying memcpy operation. - */ - void CopyFrom(const TensorImpl& src, BaseContext* context = nullptr) { - if ((void*)&src == (void*)this) { - return; - } - if (storage_.dtype() != src.meta()) { - storage_ = Storage(GetDeviceType(), src.meta()); - } - if (src.size() == -1) { - dims_.clear(); - numel_ = -1; - storage_.reset(); - return; - } - Resize(src.dims()); - if (size() > 0) { - if (storage_.dtype().copy()) { - CAFFE_ENFORCE( - GetDeviceType() == CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - CAFFE_ENFORCE( - src.GetDeviceType() == CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - storage_.dtype().copy()(src.raw_data(), raw_mutable_data(), size()); - } else { - // We'll need to use a non-CPU context to perform the copy if - // one of the context is not CPU since only non-CPU context - // knows how to copy between CPU and that context - if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { - if (!context) { - src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } else { - CAFFE_ENFORCE( - context->GetDevicetype() == src.GetDeviceType(), - "Type for provided context does not match the type of source"); - context->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } - } else { - // In case source context is CPU, and target context is non-CPU - // We'll have to create a Context from target and perform the - // copy using that context - CreateContext()->CopyBytesFromCPU( - nbytes(), src.raw_data(), raw_mutable_data()); - } - } - } - } - - /** - * @brief Extend the outer-most dimension of this tensor - * to dimension of `num`. 
- */ - void ExtendTo(TIndex num, float growthPct, BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); - CAFFE_ENFORCE(context != nullptr, "Context must be provided."); - Extend(num - dims_[0], growthPct, context); - } - - /** - * @brief Extends the outer-most dimension of this tensor by num elements, - * preserving the existing data. - * - * The underlying data may be reallocated in order to accommodate the new - * elements, in which case this tensors' capacity is grown at a factor of - * growthPct. This ensures that Extend runs on an amortized O(1) time - * complexity. - */ - void Extend(TIndex num, float growthPct, BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER( - num, 0, "`num` must be non-negative for Extend"); - auto newDims = dims_; - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - auto newNumel = std::accumulate( - newDims.begin(), - newDims.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - dims_ = newDims; - numel_ = newNumel; - return; - } - auto newCapacity = dims_; - newCapacity[0] = std::max( - newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - auto* newData = raw_mutable_data(storage_.dtype()); - CAFFE_ENFORCE( - context != nullptr, "Context must be provided to Extend the tensor"); - context->CopyItemsSameDevice( - storage_.dtype(), oldSize, oldData.get(), newData); - reserved_ = true; - dims_ = newDims; - numel_ = newNumel; - } - - /** - * @brief Shrinks the outer-most dimension to given size, keeping the data. - * - * This method guarantees that no re-allocations are carried out, which means - * that the extra capacity after the end of the shurnk tensor is maintained. - */ - void ShrinkTo(TIndex outer_dim) { - CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); - CAFFE_ENFORCE_WITH_CALLER( - outer_dim <= dims_[0], - "New outer dimension must be smaller than current."); - CAFFE_ENFORCE( - storage_.unique(), - "Can't call ShrinkTo on shared storage, please call Resize instead."); - dims_[0] = outer_dim; - numel_ = std::accumulate( - dims_.begin(), - dims_.end(), - static_cast(1), - std::multiplies()); - } - - /** - * @brief Reserve space for the underlying tensor. - * - * This must be called after Resize(), since we only specify the first - * dimension This does not copy over the old data to the newly allocated space - */ - template - void ReserveSpace(const T& outer_dim) { - CAFFE_ENFORCE( - numel_ != -1, "size should be initialized before calling ReserveSpace"); - CAFFE_ENFORCE( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - auto newCapacity = dims_; - newCapacity[0] = outer_dim; - auto newNumel = std::accumulate( - newCapacity.begin(), - newCapacity.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - return; - } - // Old data is discarded - storage_.data_ptr().reset(); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(storage_.dtype()); - dims_ = oldDims; - numel_ = oldSize; - reserved_ = true; - } - - /** - * @brief Resizes a tensor. - * - * Resize takes in a vector of ints specifying the dimensions of the tensor. 
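[Editor's note] A hedged sketch of how Resize interacts with the separate Reshape call shown a bit further down; illustrative only, with an invented function name:

    #include "caffe2/core/tensor.h"

    namespace caffe2 {

    void ResizeVsReshapeSketch() {
      Tensor t(std::vector<TIndex>{2, 3}, CPU);
      t.mutable_data<float>();               // allocation happens here, not in the constructor

      t.Reshape(std::vector<TIndex>{3, 2});  // same element count: storage is left untouched

      t.Resize(4, 5);                        // element count changes: the old buffer may be freed,
      t.mutable_data<float>();               // and a fresh one is allocated on the next mutable_data()
    }

    } // namespace caffe2
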
- * You can pass in an empty vector to specify that it is a scalar (i.e. - * containing one single item). - * - * The underlying storage may be deleted after calling Resize: if the new - * shape leads to a different number of items in the tensor, the old memory - * is deleted and new memory will be allocated next time you call - * mutable_data(). However, if the shape is different but the total number of - * items is the same, the underlying storage is kept. - */ - template - void Resize(Ts... dim_source) { - bool is_init = numel_ == -1; - bool size_changed = SetDims(dim_source...); - if (size_changed) { - // If needed, we will free the data. the next mutable_data() call - // will create the data storage. - bool reset_tensor = false; - if (reserved_) { - // If tensor is reserved then don't claim its memeory unless capacity() - // is smaller than new size - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize(); - } else { - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize() || - !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - numel_ * storage_.itemsize() > - FLAGS_caffe2_max_keep_on_shrink_memory; - } - - if (reset_tensor && !is_init) { - FreeMemory(); - } - } - } - - /** - * Resize the tensor like the source tensor. Note that this is just a - * sugar wrapper that essentially calls Resize(src_tensor.dims()). - */ - inline void ResizeLike(const TensorImpl& src_tensor) { - // Note: need casting for different context types. - if (static_cast(this) != static_cast(&src_tensor)) { - Resize(src_tensor.dims()); - } - } - - /** - * Resizes the tensor without touching underlying storage. - * This requires the total size of the tensor to remains constant. - */ - inline void Reshape(const vector& dims) { - TIndex new_size = 1; - for (auto d : dims) { - CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); - new_size *= d; - } - CAFFE_ENFORCE_WITH_CALLER( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. - " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - dims_ = dims; - } - - inline void Reshape(const vector& dims) { - Reshape(ToVectorTIndex(dims)); - } - - /** - * Release whatever memory the tensor was holding but keep size and type - * information. Subsequent call to mutable_data will trigger new memory - * allocation. - */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = Storage(storage_.device_type(), storage_.dtype()); - } - - /** - * A utility function to print the debug string for the tensor. Note that this - * is very slow since it involves quite some string operations, so do not use - * it in your performance-critical code. - */ - string DebugString() const { - std::stringstream ss; - ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << storage_.dtype().name() << " and dimension ("; - for (int d : dims_) { - ss << d << ","; - } - ss << ")."; - return ss.str(); - } - - void swap(TensorImpl& other) noexcept { - std::swap(dims_, other.dims_); - std::swap(numel_, other.numel_); - std::swap(storage_, other.storage_); - } - - /** - * @brief Shares the data with another tensor. - * - * To share data between two tensors, the sizes of the two tensors must be - * equal already. 
The reason we do not implicitly do a Resize to make the two - * tensors have the same shape is that we want to allow tensors of different - * shapes but the same number of items to still be able to share data. This - * allows one to e.g. have a n-dimensional Tensor and a flattened version - * sharing the same underlying storage. - * - * The source tensor should already have its data allocated. - */ - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an ENFORCE here which might affect perf a little bit. - CAFFE_ENFORCE_EQ_WITH_CALLER( - src.numel_, - numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - CAFFE_ENFORCE_WITH_CALLER( - src.storage_.data() || src.numel_ == 0, - "Source tensor has no content and has size > 0"); - // Finally, do sharing. - /* Since we create new Storage whenever we need to change data_type/capacity - * this still keeps the original semantics - */ - storage_ = src.storage(); - } - - /** - * @brief Shares the data with an externally managed pointer. - * - * This is similar to ShareData() but the source is a pointer with an advanced - * deleter option. In default, no deletion takes place, and one needs to make - * sure that the external memory is deallocated only after the tensor finishes - * using it. If a Deleter object is passed in, when this tensor is reallocated - * or freed, the deleter function is going to be called. - */ - template - void ShareExternalPointer(T* src, size_t capacity = 0, Deleter d = nullptr) { - ShareExternalPointer(src, TypeMeta::Make(), capacity, d); - } - - template - void ShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity = 0, - Deleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!capacity) { - capacity = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer(src, data_type, capacity, d); - } else { - // Create a new Storage - storage_ = Storage(src, GetDeviceType(), data_type, capacity, d); - } - } - - /** - * Returns a const raw void* pointer of the underlying storage. mutable_data() - * or raw_mutable_data() must have been called prior to this function call. - */ - inline const void* raw_data() const { - CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return storage_.data(); - } - - /** - * Returns a typed pointer of the underlying storage. mutable_data() or - * raw_mutable_data() must have been called prior to this function call, and - * the data type must be of the correct type. If you want to get a void* - * pointer instead, use raw_data(). - */ - template - inline const T* data() const { - CAFFE_ENFORCE_WITH_CALLER( - storage_.data() || numel_ == 0, - "The tensor is of non-zero shape, but its data is not allocated yet. 
" - "Caffe2 uses a lazy allocation, so you will need to call " - "mutable_data() or raw_mutable_data() to actually allocate memory."); - CAFFE_ENFORCE_WITH_CALLER( - IsType(), - "Tensor type mismatch, caller expects elements to be ", - TypeMeta::TypeName(), - " while tensor contains ", - storage_.dtype().name()); - return static_cast(storage_.data()); - } - - /** - * Returns a mutable raw pointer of the underlying storage. Since we will need - * to know the type of the data for allocation, a TypeMeta object is passed in - * to specify the necessary information. This is conceptually equivalent of - * calling mutable_data() where the TypeMeta parameter meta is derived from - * the type T. This function differs from mutable_data() in the sense that - * the type T can be specified during runtime via the TypeMeta object. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data(const TypeMeta& meta) { - // For 0-size tensors it's fine to return any pointer (including nullptr) - if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { - return storage_.data(); - } else { - bool had_special_dtor = storage_.dtype().dtor() != nullptr; - if (storage_.unique()) { - storage_.set_dtype(meta); - // TODO: recalcuate numel when we store numel instead of capacity in - // Storage - } else { - if (storage_.dtype() != meta) { - storage_ = Storage(storage_.device_type(), meta); - } - } - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. You probably need to call Resize() " - "before calling mutable_data()"); - - // We can reuse the existing buffer if the current data does not have - // a special destructor and the new data doesn't have a special - // constructor. - if (numel_ == 0 || - (meta.ctor() == nullptr && !had_special_dtor && - storage_.capacity() >= numel_ * storage_.itemsize())) { - return storage_.data(); - } - if (meta.ctor()) { - // For types that need placement new, we will call it, as well as - // making sure that when the data is freed, it calls the right - // destruction procedure. - auto size = numel_; - auto dtor = storage_.dtype().dtor(); - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - auto deleter = ptr_and_deleter.second; - storage_.data_ptr().reset( - ptr_and_deleter.first, [size, dtor, deleter](void* ptr) -> void { - dtor(ptr, size); - deleter(ptr); - }); - storage_.dtype().ctor()(storage_.data(), numel_); - } else { - // For fundamental type, new and delete is easier. - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.data_ptr().reset( - ptr_and_deleter.first, ptr_and_deleter.second); - } - storage_.set_numel(numel_); - return storage_.data(); - } - } - - /** - * Returns a mutable raw pointer of the underlying storage. This can only be - * used when you know for sure that the underlying storage of the tensor is - * already created via an earlier raw_mutable_data(meta) call or a - * mutable_data() call. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data() { - CAFFE_ENFORCE_WITH_CALLER( - storage_.dtype().id() != TypeIdentifier::uninitialized(), - "Calling raw_mutable_data() without meta, but the current meta is " - "of unknown type."); - return raw_mutable_data(storage_.dtype()); - } - - /** - * Returns a typed pointer of the underlying storage. 
- * - * For fundamental types, we reuse possible existing storage if there - * is sufficient capacity. - */ - template - inline T* mutable_data() { - if ((numel_ == 0 || storage_.data()) && IsType()) { - return static_cast(storage_.data()); - } - // Check it here statically - otherwise TypeMeta would throw the runtime - // error in attempt to invoke TypeMeta::ctor() - static_assert( - std::is_default_constructible::value, - "Tensor can't hold non-default-constructible types"); - return static_cast(raw_mutable_data(TypeMeta::Make())); - } - - /** - * Returns the number of dimensions of the data. - */ - inline int ndim() const { - return dims_.size(); - } - /** - * Returns the size (i.e. the number of items) of the tensor. - */ - inline TIndex size() const { - return numel_; - } - /** - * Return the number of bytes each item takes in the tensor. - */ - inline size_t itemsize() const { - return storage_.itemsize(); - } - /** - * Returns the total number of bytes of the storage. - * - * This is equivalent to calling size() * itemsize(). - */ - inline size_t nbytes() const { - return numel_ * itemsize(); - ; - } - - inline size_t capacity_nbytes() const { - return storage_.capacity(); - } - /** - * Returns the dimensions of the tensor as a vector. - */ - inline const vector& dims() const { - return dims_; - } - - inline TIndex size_from_dim(int k) const { - return size_from_dim_(k, dims_); - } - - inline TIndex size_to_dim(int k) const { - return size_to_dim_(k, dims_); - } - - inline TIndex size_between_dim(int k, int l) const { - return size_between_dim_(k, l, dims_); - } - - /** - * Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param axis_index the axis index. - * If 0 <= index < ndim(), return index. - * If -ndim <= index <= -1, return (ndim() - (-index)), - * e.g., the last axis index (ndim() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int canonical_axis_index(int axis_index) const { - return canonical_axis_index_(axis_index, ndim()); - } - - /** - * Checks if the tensor content is of the given data type. - */ - template - inline bool IsType() const { - return storage_.IsType(); - } - /** - * Returns the TypeMeta object associated with the current data type. - */ - inline const TypeMeta& meta() const { - return storage_.dtype(); - } - - /** - * Returns the i-th dimension of the tensor in int. - * - * This function returns an int value instead of TIndex, which depending on - * the typedef could be int64. If you want int64 dim values, make sure you - * call dim() instead. - */ - inline int dim32(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); - return static_cast(dims_[i]); - } - - /** - * Returns the i-th dimension of the tensor. Note that the passed in index - * must be between 0 (inclusive) and the number of dimensions, otherwise - * this function will produce a fatal message. 
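[Editor's note] To make the lazy-allocation contract spelled out in the data()/mutable_data() comments above concrete, a small sketch (again illustrative, not part of the patch, function name invented):

    #include "caffe2/core/tensor.h"

    namespace caffe2 {

    void LazyAllocationSketch() {
      Tensor t(std::vector<TIndex>{4}, CPU);
      // Nothing is allocated yet; calling t.data<float>() here would hit the enforce above.
      float* p = t.mutable_data<float>();  // the first call fixes the dtype and allocates
      p[0] = 1.f;
      const float* q = t.data<float>();    // fine now: read-only view of the same buffer
      CAFFE_ENFORCE_EQ(t.nbytes(), 4 * sizeof(float));
      (void)q;
    }

    } // namespace caffe2
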
- */ - inline TIndex dim(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - return dims_[i]; - } - - void ExtractDeviceOption(DeviceOption* device) const { - GetStaticContext()->ExtractDeviceOption(device, raw_data()); - } - - const Storage& storage() { - return storage_; - } - - const Storage& storage() const { - return storage_; - } - - protected: - using DimVector = std::vector; - DimVector dims_; // sizes_ - TIndex numel_ = -1; // numel_ - // we decide to keep reserved_ and it will - // live in Tensor after the split - // The logic is that if Extend() or ReserveSpace() were ever called, - // then subsequent Resize()s will not free up Storage. - bool reserved_ = false; - Storage storage_; - // int64_t storage_offset_; - - private: - template < - typename T, - typename = typename std::enable_if::value>::type> - bool SetDims(const vector& src) { - auto old_numel = numel_; - dims_.resize(src.size()); - TIndex new_numel = 1; - for (size_t i = 0; i < src.size(); ++i) { - new_numel *= src[i]; - dims_[i] = src[i]; - } - numel_ = new_numel; - return numel_ != old_numel; - } - - bool SetDims() { - auto old_numel = numel_; - dims_.resize(0); - numel_ = 1; - return numel_ != old_numel; - } - - // TODO(jiayq): maybe rewrite the following functions with initializer list. - // NVCC does not play well with initializer lists last time, but worth - // another shot. - bool SetDims(const TIndex d0) { - auto old_numel = numel_; - dims_.resize(1); - dims_[0] = d0; - numel_ = d0; - return numel_ != old_numel; - } - - bool SetDims(const TIndex d0, const TIndex d1) { - auto old_numel = numel_; - dims_.resize(2); - dims_[0] = d0; - dims_[1] = d1; - numel_ = d0 * d1; - return numel_ != old_numel; - } - - bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { - auto old_numel = numel_; - dims_.resize(3); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - numel_ = d0 * d1 * d2; - return numel_ != old_numel; - } - - bool - SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { - auto old_numel = numel_; - dims_.resize(4); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - dims_[3] = d3; - numel_ = d0 * d1 * d2 * d3; - return numel_ != old_numel; - } -}; - class CAFFE2_API UndefinedTensorImpl final : public TensorImpl { UndefinedTensorImpl() : TensorImpl(CPU){}; @@ -911,45 +50,75 @@ class CAFFE2_API Tensor final { return impl_.get(); } - explicit Tensor(DeviceType type) - : impl_(c10::make_intrusive(type)) {} + explicit Tensor(Storage storage) + : impl_(c10::make_intrusive(std::move(storage))) {} + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? 
+ Resize(dims); + } explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + Resize(dims); + } + /** + * context_for_copy is required to have the same DeviceType as src + */ Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - context_for_copy, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src, context_for_copy); + } + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ Tensor(const Tensor& src, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src); + } + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ template Tensor( const vector& dims, const vector& values, BaseContext* context) - : impl_(c10::make_intrusive( - dims, - values, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size()); + context->CopyItemsFromCPU( + storage().dtype(), size(), values.data(), mutable_data()); + } + /** + * @brief Creates a scalar tensor, and fills its content with the given value. + * The type of tensor will be decided by the context parameter + */ template < typename T, typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) - : impl_(c10::make_intrusive( - value, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(std::vector{}); + context->CopyItemsFromCPU( + storage().dtype(), size(), &value, mutable_data()); + } Tensor Clone() const { Tensor x(GetDeviceType()); @@ -1019,26 +188,40 @@ class CAFFE2_API Tensor final { // swap method swaps the CONTENTS of the tensors, while std::swap // swaps the POINTERS. void swap(const Tensor& other) const noexcept { - impl_.get()->swap(*other.impl_.get()); + // NB: use get() to get a non-const pointer! 
+ std::swap(*impl_.get(), *other.impl_.get()); } void ShareData(const Tensor& src) const { impl_.get()->ShareData(*src.impl_.get()); } - template - void ShareExternalPointer(T* src, size_t capacity = 0, Deleter d = nullptr) - const { - impl_.get()->ShareExternalPointer(src, capacity, d); + template + void ShareExternalPointer( + T* src, + size_t capacity = 0, + MemoryDeleter d = nullptr) const { + impl_.get()->ShareExternalPointer(src, capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) const { + impl_.get()->ShareExternalPointer(std::move(data_ptr), capacity); } - template void ShareExternalPointer( void* src, const TypeMeta& meta, size_t capacity = 0, - Deleter d = nullptr) const { - impl_.get()->ShareExternalPointer(src, meta, capacity, d); + MemoryDeleter d = nullptr) const { + impl_.get()->ShareExternalPointer(src, meta, capacity, d); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, capacity); } inline const void* raw_data() const { @@ -1103,6 +286,18 @@ class CAFFE2_API Tensor final { return impl_.get()->canonical_axis_index(axis_index); } + inline int64_t stride(int64_t dim) const { + return impl_.get()->stride(dim); + } + + inline at::DimVector strides() { + return impl_.get()->strides(); + } + + inline bool is_contiguous() const { + return impl_.get()->is_contiguous(); + } + template inline bool IsType() const { return impl_.get()->IsType(); @@ -1123,6 +318,10 @@ class CAFFE2_API Tensor final { inline void ExtractDeviceOption(DeviceOption* device) const { return impl_.get()->ExtractDeviceOption(device); } + + const Storage& storage() { + return impl_->storage(); + } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc new file mode 100644 index 00000000000000..cff98c6101ea5d --- /dev/null +++ b/caffe2/core/tensor_impl.cc @@ -0,0 +1,14 @@ +#include "caffe2/core/tensor_impl.h" + +#include "caffe2/core/flags.h" + +CAFFE2_DEFINE_bool( + caffe2_keep_on_shrink, + true, + "If set, keeps memory when a tensor is shrinking its size."); + +CAFFE2_DEFINE_int64( + caffe2_max_keep_on_shrink_memory, + LLONG_MAX, + "The maximum memory in bytes to keep on shrink, if the difference between " + "tensor sizes is bigger than this then tensor will be reset."); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h new file mode 100644 index 00000000000000..3f42ed36b30954 --- /dev/null +++ b/caffe2/core/tensor_impl.h @@ -0,0 +1,915 @@ +#pragma once + +#include +#include +#include + +#include "caffe2/core/allocator.h" +#include "caffe2/core/common.h" +#include "caffe2/core/flags.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/context_base.h" + +// A global boolean variable to control whether we free memory when a Tensor +// is shrinked to a smaller size. As a result, a Tensor is always going to +// keep the memory allocated for its maximum capacity reshaped to so far. +CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); + +// Since we can have high variance in blob memory allocated across different +// inputs in the same run, we will shrink the blob only if the memory gain +// is larger than this flag in bytes. +CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); + +namespace caffe2 { + +/** + * A utility function to convert vector to vector. 
+ */ +inline std::vector ToVectorTIndex(const std::vector& src) { + return std::vector(src.begin(), src.end()); +} + +/** + * Return product of all dimensions starting from k + */ +inline TIndex size_from_dim_(int k, const std::vector& dims) { + TIndex r = 1; + for (size_t i = k; i < dims.size(); ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims up to k (not including dims[k]) +inline TIndex size_to_dim_(int k, const std::vector& dims) { + CAFFE_ENFORCE((unsigned)k <= dims.size()); + TIndex r = 1; + for (int i = 0; i < k; ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims between k and l (not including dims[k] and dims[l]) +inline TIndex size_between_dim_(int k, int l, const std::vector& dims) { + CAFFE_ENFORCE((unsigned)l < dims.size()); + TIndex r = 1; + if (k < l) { + for (int i = k + 1; i < l; ++i) { + r *= dims[i]; + } + } else { + for (int i = l + 1; i < k; ++i) { + r *= dims[i]; + } + } + return r; +} + +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +inline int canonical_axis_index_(int axis_index, int ndims) { + CAFFE_ENFORCE_GE(axis_index, -ndims); + CAFFE_ENFORCE_LT(axis_index, ndims); + if (axis_index < 0) { + return axis_index + ndims; + } + return axis_index; +} + +/** + * @brief TensorImpl is the implementation of a tensor and the basic class + * in Caffe2 that stores a contiguous memory with its shape information. + * + * The TensorImpl class is essentially a wrapper around a device-specific memory + * (the device is specified by the Context template argument), and deals with + * the allocation and de-allocation of such memory. We make a simplified + * assumption that the memory is always contiguous. + */ +class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { + public: + TensorImpl() = delete; + + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { + data_type_ = storage_ ? storage_.dtype() : TypeMeta{}; + } + + TensorImpl(const TensorImpl&) = default; + TensorImpl& operator=(const TensorImpl&) = default; + TensorImpl(TensorImpl&&) = default; + TensorImpl& operator=(TensorImpl&&) = default; + + virtual ~TensorImpl() noexcept {} + + /* + * Since we removed template from tensor, we now store a static + * context pointer in tensor, which indicates the type of the tensor. + */ + at::BaseStaticContext* GetStaticContext() const { + auto device_type = GetDeviceType(); + return get_static_context(device_type); + } + + /* @brief + * Create a context that has the same device_type + * as the tensor. + * Note that this doesn't support passing in argument + * TODO(jerryzh): move this to a global registry + * that can create context for us + */ + std::unique_ptr CreateContext() const { + return GetStaticContext()->CreateContext(); + } + + at::DeviceType GetDeviceType() const { + return storage_.device_type(); + } + + /** + * @brief Copies the data from a source tensor, with a contex provided to + * carry out the underlying memcpy operation. 
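For orientation, the shape helpers added above (size_from_dim_, size_to_dim_, size_between_dim_, canonical_axis_index_) are plain index arithmetic over the dims vector. The following standalone sketch (not part of the patch; int64_t stands in for TIndex) reproduces their behavior:

```
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors size_from_dim_: product of dims[k], dims[k+1], ...
int64_t size_from_dim(int k, const std::vector<int64_t>& dims) {
  int64_t r = 1;
  for (size_t i = static_cast<size_t>(k); i < dims.size(); ++i) {
    r *= dims[i];
  }
  return r;
}

// Mirrors size_to_dim_: product of dims[0] .. dims[k-1]
int64_t size_to_dim(int k, const std::vector<int64_t>& dims) {
  int64_t r = 1;
  for (int i = 0; i < k; ++i) {
    r *= dims[i];
  }
  return r;
}

// Mirrors canonical_axis_index_: wrap a negative axis, so -1 is the last dim
int canonical_axis_index(int axis_index, int ndims) {
  return axis_index < 0 ? axis_index + ndims : axis_index;
}

int main() {
  std::vector<int64_t> dims{2, 3, 4, 5};
  assert(size_from_dim(2, dims) == 20);      // 4 * 5
  assert(size_to_dim(2, dims) == 6);         // 2 * 3
  assert(size_from_dim(0, dims) == 120);     // the full numel
  assert(canonical_axis_index(-1, 4) == 3);  // last axis
  return 0;
}
```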
+ */ + void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { + if ((void*)&src == (void*)this) { + return; + } + if (data_type_ != src.meta()) { + CAFFE_ENFORCE_WITH_CALLER( + src.is_contiguous(), + "Right now only copy of contiguous source Tensor is supported."); + storage_ = at::Storage(GetDeviceType(), src.meta()); + data_type_ = src.meta(); + } + if (src.size() == -1) { + dims_.clear(); + numel_ = -1; + strides_.clear(); + is_contiguous_ = true; + storage_.reset(); + data_type_ = TypeMeta(); + return; + } + Resize(src.dims()); + if (size() > 0) { + if (data_type_.copy()) { + CAFFE_ENFORCE( + GetDeviceType() == CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + CAFFE_ENFORCE( + src.GetDeviceType() == CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); + } else { + // We'll need to use a non-CPU context to perform the copy if + // one of the context is not CPU since only non-CPU context + // knows how to copy between CPU and that context + if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { + if (!context) { + src.CreateContext()->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } else { + CAFFE_ENFORCE( + context->device_type() == src.GetDeviceType(), + "Type for provided context does not match the type of source"); + context->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } + } else { + // In case source context is CPU, and target context is non-CPU + // We'll have to create a Context from target and perform the + // copy using that context + CreateContext()->CopyBytesFromCPU( + nbytes(), src.raw_data(), raw_mutable_data()); + } + } + } + } + + /** + * @brief Extend the outer-most dimension of this tensor + * to dimension of `num`. + */ + void ExtendTo(TIndex num, float growthPct, at::BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); + CAFFE_ENFORCE(context != nullptr, "Context must be provided."); + Extend(num - dims_[0], growthPct, context); + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. 
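To make the amortized-growth claim concrete, this is the capacity rule the implementation below applies, written as a standalone sketch (illustrative only, not part of the patch):

```
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// The reserved outer dimension becomes at least
// ceil(old_outer * (growthPct + 100) / 100), and never less than what the
// caller actually requested.
int64_t grown_outer_dim(int64_t old_outer, int64_t num, float growthPct) {
  const int64_t requested = old_outer + num;
  const int64_t grown = static_cast<int64_t>(
      std::ceil(old_outer * (growthPct + 100) / 100));
  return std::max(requested, grown);
}

int main() {
  // With growthPct = 40, appending one row to a 100-row tensor reserves room
  // for 140 rows, so the next 39 single-row Extend() calls reuse the buffer.
  std::cout << grown_outer_dim(100, 1, 40.f) << "\n";   // 140
  std::cout << grown_outer_dim(100, 80, 40.f) << "\n";  // 180, request dominates
  return 0;
}
```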
+ */ + void Extend(TIndex num, float growthPct, at::BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER( + num, 0, "`num` must be non-negative for Extend"); + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); + auto newDims = dims_; + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + auto newNumel = std::accumulate( + newDims.begin(), + newDims.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + dims_ = newDims; + numel_ = newNumel; + return; + } + auto newCapacity = dims_; + newCapacity[0] = std::max( + newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + auto oldDims = dims_; + Resize(newCapacity); + auto* newData = raw_mutable_data(data_type_); + CAFFE_ENFORCE( + context != nullptr, "Context must be provided to Extend the tensor"); + context->CopyItemsSameDevice( + data_type_, oldSize, oldData.get(), newData); + reserved_ = true; + dims_ = newDims; + numel_ = newNumel; + } + + /** + * @brief Shrinks the outer-most dimension to given size, keeping the data. + * + * This method guarantees that no re-allocations are carried out, which means + * that the extra capacity after the end of the shurnk tensor is maintained. + */ + void ShrinkTo(TIndex outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShrinkTo is only supported on contiguous Tensor."); + CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); + CAFFE_ENFORCE_WITH_CALLER( + outer_dim <= dims_[0], + "New outer dimension must be smaller than current."); + CAFFE_ENFORCE( + storage_.unique(), + "Can't call ShrinkTo on shared storage, please call Resize instead."); + dims_[0] = outer_dim; + numel_ = std::accumulate( + dims_.begin(), + dims_.end(), + static_cast(1), + std::multiplies()); + } + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + template + void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); + CAFFE_ENFORCE( + numel_ != -1, "size should be initialized before calling ReserveSpace"); + CAFFE_ENFORCE( + storage_.unique(), "Can't call ReserveSpace on shared storage."); + auto newCapacity = dims_; + newCapacity[0] = outer_dim; + auto newNumel = std::accumulate( + newCapacity.begin(), + newCapacity.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + auto oldDims = dims_; + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(data_type_); + dims_ = oldDims; + numel_ = oldSize; + reserved_ = true; + } + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. + * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). 
+ * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + */ + template + void Resize(Ts... dim_source) { + bool is_init = numel_ == -1; + bool size_changed = SetDims(dim_source...); + if (size_changed) { + // If needed, we will free the data. the next mutable_data() call + // will create the data storage. + bool reset_tensor = false; + if (reserved_) { + // If tensor is reserved then don't claim its memeory unless capacity() + // is smaller than new size + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); + } else { + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || + !FLAGS_caffe2_keep_on_shrink || + storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > + FLAGS_caffe2_max_keep_on_shrink_memory; + } + + if (reset_tensor && !is_init) { + FreeMemory(); + } + } + } + + /** + * Resize the tensor like the source tensor. Note that this is just a + * sugar wrapper that essentially calls Resize(src_tensor.dims()). + */ + inline void ResizeLike(const TensorImpl& src_tensor) { + CAFFE_ENFORCE_WITH_CALLER( + src_tensor.is_contiguous(), + "Right now ResizeLike is only supported for contiguous Tensor."); + // Note: need casting for different context types. + if (static_cast(this) != static_cast(&src_tensor)) { + Resize(src_tensor.dims()); + } + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + inline void Reshape(const std::vector& dims) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); + TIndex new_size = 1; + for (auto d : dims) { + CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); + new_size *= d; + } + CAFFE_ENFORCE_WITH_CALLER( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + dims_ = dims; + } + + inline void Reshape(const std::vector& dims) { + Reshape(ToVectorTIndex(dims)); + } + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + inline void FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = at::Storage(storage_.device_type(), data_type_); + storage_offset_ = 0; + } + + /** + * A utility function to print the debug string for the tensor. Note that this + * is very slow since it involves quite some string operations, so do not use + * it in your performance-critical code. + */ + std::string DebugString() const { + std::stringstream ss; + ss << "A Tensor of item size " << storage_.itemsize() << " and type " + << data_type_.name() << " and dimension ("; + for (int d : dims_) { + ss << d << ","; + } + ss << ")."; + return ss.str(); + } + + /** + * @brief Shares the data with another tensor. 
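A concrete, hypothetical illustration of the sharing described here, using the Tensor-level ShareData wrapper shown earlier: a rank-2 tensor and a 1-D tensor with the same number of elements can alias one buffer.

```
#include "caffe2/core/tensor.h"

// Sketch only; assumes the caffe2 Tensor API from this patch.
void flattened_view_example() {
  using namespace caffe2;
  Tensor t(std::vector<TIndex>{2, 3}, CPU);
  t.mutable_data<float>();      // materialize the 6-element buffer

  Tensor flat(std::vector<TIndex>{6}, CPU);
  flat.ShareData(t);            // same storage, viewed as 1-D

  t.mutable_data<float>()[0] = 1.f;
  // flat.data<float>()[0] now reads 1.f as well.
}
```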
+ * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + void ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an ENFORCE here which might affect perf a little bit. + CAFFE_ENFORCE_EQ_WITH_CALLER( + src.numel_, + numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + CAFFE_ENFORCE_WITH_CALLER( + src.storage_.data() || src.numel_ == 0, + "Source tensor has no content and has size > 0"); + // Finally, do sharing. + /* Since we create new Storage whenever we need to change data_type/capacity + * this still keeps the original semantics + */ + storage_ = src.storage(); + data_type_ = src.dtype(); + storage_offset_ = src.storage_offset(); + } + + /** + * @brief Shares the data with an externally managed pointer. + * + * This is similar to ShareData() but the source is a pointer with an advanced + * deleter option. In default, no deletion takes place, and one needs to make + * sure that the external memory is deallocated only after the tensor finishes + * using it. If a Deleter object is passed in, when this tensor is reallocated + * or freed, the deleter function is going to be called. 
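A hypothetical caller-side sketch of the behavior described above, using the Tensor-level wrapper shown earlier (buffer name and sizes are illustrative):

```
#include "caffe2/core/tensor.h"

// Sketch only; assumes the caffe2 Tensor API from this patch.
void wrap_external_buffer(float* buffer) {     // buffer stays owned by the caller
  using namespace caffe2;
  Tensor t(std::vector<TIndex>{2, 3}, CPU);    // shape must be set first
  // No deleter is passed, so the tensor never frees `buffer`; the caller has
  // to keep it alive for as long as the tensor is in use.
  t.ShareExternalPointer(buffer, 6 * sizeof(float));
}
```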
+ */ + template + void + ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { + ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { + ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); + } + + void ShareExternalPointer( + void* src, + const TypeMeta& data_type, + size_t capacity = 0, + MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShareExternalPointer is only supported for contiguos Tensor."); + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + ShareExternalPointer( + at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!capacity) { + capacity = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "To share data with a raw pointer, you need to set shape first."); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + data_type_ = data_type; + storage_offset_ = 0; + } else { + int64_t numel = capacity / data_type.itemsize(); + // Create a new Storage + storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + data_type_ = data_type; + storage_offset_ = 0; + } + } + + /** + * Returns a const raw void* pointer of the underlying storage. mutable_data() + * or raw_mutable_data() must have been called prior to this function call. + */ + inline const void* raw_data() const { + CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); + return static_cast(static_cast(storage_.data()) + storage_offset_ * storage_.itemsize()); + } + + /** + * Returns a typed pointer of the underlying storage. mutable_data() or + * raw_mutable_data() must have been called prior to this function call, and + * the data type must be of the correct type. If you want to get a void* + * pointer instead, use raw_data(). + */ + template + inline const T* data() const { + CAFFE_ENFORCE_WITH_CALLER( + storage_.data() || numel_ == 0, + "The tensor is of non-zero shape, but its data is not allocated yet. " + "Caffe2 uses a lazy allocation, so you will need to call " + "mutable_data() or raw_mutable_data() to actually allocate memory."); + CAFFE_ENFORCE_WITH_CALLER( + IsType(), + "Tensor type mismatch, caller expects elements to be ", + TypeMeta::TypeName(), + ", while tensor contains ", + data_type_.name(), + ". "); + return static_cast(storage_.data()) + storage_offset_; + } + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. 
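In caller terms, the lazy-allocation contract referenced throughout this header looks like the following hypothetical sketch (assumes the Tensor wrapper from this patch):

```
#include "caffe2/core/tensor.h"

// Sketch only; illustrative.
void lazy_allocation_example() {
  using namespace caffe2;
  Tensor t(std::vector<TIndex>{4, 5}, CPU);  // shape recorded, no memory yet
  // Calling t.data<float>() here would trip the "data is not allocated yet"
  // enforce above.
  float* p = t.mutable_data<float>();        // first call allocates 20 floats
  p[0] = 3.14f;
  const float* cp = t.data<float>();         // fine now; same buffer
  (void)cp;
}
```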
+ */ + inline void* raw_mutable_data(const TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (data_type_ == meta && (storage_.data() || numel_ == 0)) { + return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); + } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. You probably need to call Resize() " + "before calling mutable_data()"); + bool had_special_dtor = data_type_.dtor() != nullptr; + storage_offset_ = 0; + if (storage_.unique()) { + storage_.set_dtype(meta); + } else { + if (data_type_ != meta) { + storage_ = at::Storage(storage_.device_type(), meta); + } + } + data_type_ = meta; + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. + if (numel_ == 0 || + (meta.ctor() == nullptr && !had_special_dtor && + storage_.numel() >= numel_)) { + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); + if (meta.ctor()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. + auto size = numel_; + auto dtor = data_type_.dtor(); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); + data_type_.ctor()(storage_.data(), numel_); + } else { + // For fundamental type, new and delete is easier. + auto ptr_and_deleter = + GetStaticContext()->New(numel_ * storage_.itemsize()); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); + } + storage_.set_numel(numel_); + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + } + + /** + * Returns a mutable raw pointer of the underlying storage. This can only be + * used when you know for sure that the underlying storage of the tensor is + * already created via an earlier raw_mutable_data(meta) call or a + * mutable_data() call. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data() { + CAFFE_ENFORCE_WITH_CALLER( + data_type_.id() != TypeIdentifier::uninitialized(), + "Calling raw_mutable_data() without meta, but the current meta is " + "of unknown type."); + return raw_mutable_data(data_type_); + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. 
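Concretely, the reuse condition in raw_mutable_data() above plays out as in this hypothetical sketch:

```
#include <string>
#include "caffe2/core/tensor.h"

// Sketch only; assumes the caffe2 Tensor API from this patch.
void dtype_switch_example() {
  using namespace caffe2;
  Tensor t(std::vector<TIndex>{8}, CPU);
  t.mutable_data<float>();        // allocates 8 * sizeof(float)
  t.mutable_data<int>();          // trivial type, same element count: reused
  t.mutable_data<std::string>();  // needs placement-new: a fresh buffer is
                                  // allocated and the strings are constructed
}
```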
+ */ + template + inline T* mutable_data() { + if ((numel_ == 0 || storage_.data()) && IsType()) { + return static_cast(storage_.data()) + storage_offset_; + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible::value, + "Tensor can't hold non-default-constructible types"); + return static_cast(raw_mutable_data(TypeMeta::Make())); + } + + /** + * Returns the number of dimensions of the data. + */ + inline int ndim() const { + return dims_.size(); + } + /** + * Returns the size (i.e. the number of items) of the tensor. + */ + inline TIndex size() const { + return numel_; + } + /** + * Return the number of bytes each item takes in the tensor. + */ + inline size_t itemsize() const { + return storage_.itemsize(); + } + /** + * Returns the total number of bytes of the storage. + * + * This is equivalent to calling size() * itemsize(). + */ + inline size_t nbytes() const { + return numel_ * itemsize(); + ; + } + + // NB: This capacity may also include available space + // in the storage BEFORE the tensor data, if storage_offset != 0 + inline size_t capacity_nbytes() const { + return storage_.capacity(); + } + /** + * Returns the dimensions of the tensor as a vector. + */ + inline const std::vector& dims() const { + return dims_; + } + + inline TIndex size_from_dim(int k) const { + return size_from_dim_(k, dims_); + } + + inline TIndex size_to_dim(int k) const { + return size_to_dim_(k, dims_); + } + + inline TIndex size_between_dim(int k, int l) const { + return size_between_dim_(k, l, dims_); + } + + /** + * Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param axis_index the axis index. + * If 0 <= index < ndim(), return index. + * If -ndim <= index <= -1, return (ndim() - (-index)), + * e.g., the last axis index (ndim() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ + inline int canonical_axis_index(int axis_index) const { + return canonical_axis_index_(axis_index, ndim()); + } + + inline int64_t stride(int64_t dim) const { +#ifndef NDEBUG + // TODO: dim wrapping? + CAFFE_ENFORCE_LT_WITH_CALLER(dim, strides_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER( + dim, 0, "Cannot have negative dimension index"); +#endif + return strides_[dim]; + } + + // TODO: Change to ArrayRef later + inline at::DimVector strides() { + return strides_; + } + + inline bool is_contiguous() const { + return is_contiguous_; + } + + /** + * Checks if the tensor content is of the given data type. + */ + template + inline bool IsType() const { + return storage_.IsType(); + } + /** + * Returns the TypeMeta object associated with the current data type. + */ + inline const TypeMeta& meta() const { + return data_type_; + } + + inline const TypeMeta& dtype() const { + return data_type_; + } + + /** + * Returns the i-th dimension of the tensor in int. + * + * This function returns an int value instead of TIndex, which depending on + * the typedef could be int64. If you want int64 dim values, make sure you + * call dim() instead. 
+ */ + inline int dim32(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); + return static_cast(dims_[i]); + } + + /** + * Returns the i-th dimension of the tensor. Note that the passed in index + * must be between 0 (inclusive) and the number of dimensions, otherwise + * this function will produce a fatal message. + */ + inline TIndex dim(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + return dims_[i]; + } + + void ExtractDeviceOption(DeviceOption* device) const { + auto* context = GetStaticContext(); + CHECK(context); + context->ExtractDeviceOption(device, raw_data()); + } + + const at::Storage& storage() { + return storage_; + } + + const at::Storage& storage() const { + return storage_; + } + + int64_t storage_offset() const { + return storage_offset_; + } + + protected: + // TODO: change to DimVector + std::vector dims_; // sizes_ + at::DimVector strides_; + TIndex numel_ = -1; // numel_ + bool is_contiguous_ = true; + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. + bool reserved_ = false; + at::Storage storage_; + int64_t storage_offset_ = 0; + TypeMeta data_type_; + + private: + template < + typename T, + typename = typename std::enable_if::value>::type> + bool SetDims(const std::vector& src) { + auto old_numel = numel_; + dims_.resize(src.size()); + TIndex new_numel = 1; + for (size_t i = 0; i < src.size(); ++i) { + new_numel *= src[i]; + dims_[i] = src[i]; + } + update_strides(); + numel_ = new_numel; + return numel_ != old_numel; + } + + bool SetDims() { + auto old_numel = numel_; + dims_.resize(0); + update_strides(); + numel_ = 1; + return numel_ != old_numel; + } + + // TODO(jiayq): maybe rewrite the following functions with initializer list. + // NVCC does not play well with initializer lists last time, but worth + // another shot. 
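Each SetDims overload below also refreshes strides_ through update_strides(), defined after them. The resulting row-major layout, exposed by the new stride()/strides()/is_contiguous() accessors above, can be reproduced standalone (illustrative only):

```
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors update_strides(): the innermost stride is 1 and each outer stride is
// the next stride times the next dimension, clamped to at least 1.
std::vector<int64_t> row_major_strides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size());
  if (!dims.empty()) {
    strides.back() = 1;
    for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * std::max<int64_t>(dims[i + 1], 1);
    }
  }
  return strides;
}

int main() {
  assert((row_major_strides({2, 3, 4}) == std::vector<int64_t>{12, 4, 1}));
  // A zero-sized dimension is clamped to 1 so no stride collapses to zero.
  assert((row_major_strides({2, 0, 4}) == std::vector<int64_t>{4, 4, 1}));
  return 0;
}
```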
+ bool SetDims(const TIndex d0) { + auto old_numel = numel_; + dims_.resize(1); + dims_[0] = d0; + update_strides(); + numel_ = d0; + return numel_ != old_numel; + } + + bool SetDims(const TIndex d0, const TIndex d1) { + auto old_numel = numel_; + dims_.resize(2); + dims_[0] = d0; + dims_[1] = d1; + update_strides(); + numel_ = d0 * d1; + return numel_ != old_numel; + } + + bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { + auto old_numel = numel_; + dims_.resize(3); + dims_[0] = d0; + dims_[1] = d1; + dims_[2] = d2; + update_strides(); + numel_ = d0 * d1 * d2; + return numel_ != old_numel; + } + + bool + SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { + auto old_numel = numel_; + dims_.resize(4); + dims_[0] = d0; + dims_[1] = d1; + dims_[2] = d2; + dims_[3] = d3; + update_strides(); + numel_ = d0 * d1 * d2 * d3; + return numel_ != old_numel; + } + + inline void update_strides() { + strides_.resize(dims_.size()); + if (ndim() > 0) { + int last_idx = ndim() - 1; + strides_[last_idx] = 1; + for (auto i = last_idx - 1; i >= 0; --i) { + strides_[i] = strides_[i + 1] * std::max(dims_[i + 1], 1); + } + } + is_contiguous_ = true; + } +}; + +} diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 25d4e16d2f9e7a..311c6446184a87 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -33,7 +33,7 @@ class IDEEPConcatOp final : public IDEEPOperator { if (OperatorBase::InputBlob(i).template IsType()) { inputs.emplace_back(Input(i)); } else { - CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsType(CPU), + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU), "Expect cpu tensor if not itensor"); auto& tensor_cpu = OperatorBase::Input(i, CPU); CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 9ae2323442120a..08e6de2ae3f0dc 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index 63bd0da7cb5cb6..626568a989b939 100644 --- a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { USE_IDEEP_DEF_ALIASES(); bool RunOnDevice() override { const auto& input_blob = OperatorBase::InputBlob(0); - if (input_blob.template IsType(CPU)) { + if (input_blob.IsTensorType(CPU)) { VLOG(2) << "Directing sharing of TensorCPU"; const auto& X = OperatorBase::Input(0, CPU); auto* Y = OperatorBase::Output(0, CPU); diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index c8657728c57e76..f50a4f34c66789 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -119,7 +119,7 @@ class IDEEPContext final : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return IDEEP; } diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h 
b/caffe2/mkl/operators/operator_fallback_mkl.h index ce4c85d2c231e0..6d9713b74612d8 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.h +++ b/caffe2/mkl/operators/operator_fallback_mkl.h @@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "MKL fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/mkl/operators/packed_fc_op.cc b/caffe2/mkl/operators/packed_fc_op.cc index 1f3231dc521f5d..0ed93cf061070c 100644 --- a/caffe2/mkl/operators/packed_fc_op.cc +++ b/caffe2/mkl/operators/packed_fc_op.cc @@ -49,7 +49,7 @@ class PackedFCOp final : public Operator { // Check out what is the passed in format. const MKLPackedMatrix* packed_matrix = nullptr; - if (OperatorBase::InputIsType(1, CPU)) { + if (OperatorBase::InputIsTensorType(1, CPU)) { const auto& W = Input(1); CAFFE_ENFORCE_EQ(W.ndim(), 2); CAFFE_ENFORCE_EQ(W.dim32(0), N); diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 636ebf2217eac8..0a7b5808a446be 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -127,7 +127,7 @@ class MKLContext : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return MKLDNN; } diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 755e1b5a57b8a9..2238d7af08dda6 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType(CPU) || + if (!noiseBlob->IsTensorType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mpi/mpi_ops.h b/caffe2/mpi/mpi_ops.h index b84de851a9e948..8657c107ed0f33 100644 --- a/caffe2/mpi/mpi_ops.h +++ b/caffe2/mpi/mpi_ops.h @@ -36,7 +36,7 @@ class MPIBroadcastOp final : public Operator { bool RunOnDevice() override { MPI_Comm comm = OperatorBase::Input(0).comm(); CAFFE_ENFORCE( - OperatorBase::OutputIsType(0, Context::GetDeviceType()), + OperatorBase::OutputIsTensorType(0, Context::GetDeviceType()), "Output is of wrong type."); auto* output = Output(0); // Make sure that output is already allocated. 
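The ideep, mkl, mpscnn, and mpi hunks above, and the observer and operator hunks below, all apply the same mechanical rename: device-qualified IsType<Tensor>(device) checks become the dedicated IsTensorType helpers. The call-site pattern, shown on a hypothetical operator (name and body invented for illustration), is:

```
#include "caffe2/core/operator.h"

namespace caffe2 {

// Hypothetical operator, for illustration only.
template <class Context>
class ExampleFallbackOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ExampleFallbackOp);

  bool RunOnDevice() override {
    // Previously written as the device-qualified IsType check on input 0.
    if (this->InputIsTensorType(0, CPU)) {
      // Input 0 is a TensorCPU, so Input(0, CPU) is safe here.
    }
    return true;
  }
};

} // namespace caffe2
```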
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc index 5d40916be09346..bf4e20b7904711 100644 --- a/caffe2/observers/profile_observer_gpu.cc +++ b/caffe2/observers/profile_observer_gpu.cc @@ -26,12 +26,12 @@ void ProfileOperatorObserver::Dump() const { LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type() << " op#" << getId() << " ---------"; for (int i = 0; i < subject_->InputSize(); ++i) { - if (subject_->InputIsType(i, CPU)) { + if (subject_->InputIsTensorType(i, CPU)) { const auto& tensor = subject_->Input(i, CPU); const auto& name = subject_->debug_def().input(i); TensorPrinter printer(name); LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor); - } else if (subject_->InputIsType(i, CUDA)) { + } else if (subject_->InputIsTensorType(i, CUDA)) { const auto& tensor = subject_->Input(i, CUDA); const auto& name = subject_->debug_def().input(i); TensorPrinter printer(name); @@ -46,12 +46,12 @@ void ProfileOperatorObserver::Dump() const { } for (int o = 0; o < subject_->OutputSize(); ++o) { - if (subject_->OutputIsType(o, CPU)) { + if (subject_->OutputIsTensorType(o, CPU)) { auto* tensor = subject_->Output(o, CPU); const auto& name = subject_->debug_def().output(o); TensorPrinter printer(name); LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor); - } else if (subject_->OutputIsType(o, CUDA)) { + } else if (subject_->OutputIsTensorType(o, CUDA)) { auto* tensor = subject_->Output(o, CUDA); const auto& name = subject_->debug_def().output(o); TensorPrinter printer(name); diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 5ccea70926fb6f..2350910febff27 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -301,7 +301,8 @@ Caffe2Backend::get_renamed_operators() const { {"Less", "LT"}, {"Greater", "GT"}, {"Unsqueeze", "ExpandDims"}, - {"Tile", "NumpyTile"}}; + {"Tile", "NumpyTile"}, + {"DynamicSlice", "Slice"}}; return kRenamedOperators; } @@ -356,7 +357,8 @@ Caffe2Backend::get_special_operators() const { {"MatMul", &Caffe2Backend::CreateMatMul}, {"Upsample", &Caffe2Backend::CreateUpsample}, {"Dropout", &Caffe2Backend::CreateDropout}, - {"LRN", &Caffe2Backend::CreateLRN}}; + {"LRN", &Caffe2Backend::CreateLRN}, + {"DynamicSlice", &Caffe2Backend::CreateDynamicSlice}}; return kSpecialOperators; } @@ -899,7 +901,6 @@ Caffe2Ops Caffe2Backend::CreateSlice( auto starts_vals_tensor = dummy_->NewDummyName(); auto starts_tensor = dummy_->NewDummyName(); - auto casted_starts_tensor = dummy_->NewDummyName(); c2_op = ret.ops.Add(); { caffe2::Argument shape_starts; @@ -936,12 +937,9 @@ Caffe2Ops Caffe2Backend::CreateSlice( caffe2::Argument to; to.set_name("to"); to.set_i(static_cast(caffe2::TensorProto::INT32)); - c2_op = ret.ops.Add(); - BuildOperator(c2_op, "Cast", {starts_tensor}, {casted_starts_tensor}, {to}); auto ends_vals_tensor = dummy_->NewDummyName(); auto ends_tensor = dummy_->NewDummyName(); - auto casted_ends_tensor = dummy_->NewDummyName(); c2_op = ret.ops.Add(); { caffe2::Argument shape_ends; @@ -965,17 +963,168 @@ Caffe2Ops Caffe2Backend::CreateSlice( "ScatterAssign", {ends_tensor, axes_tensor, ends_vals_tensor}, {ends_tensor}); - // Slice only accepts ends as int + + // attach the original op at the end + c2_op = ret.ops.Add(); + c2_op->CopyFrom(*op); + c2_op->mutable_input()->Clear(); + c2_op->add_input(data); + c2_op->add_input(starts_tensor); + c2_op->add_input(ends_tensor); + c2_op->mutable_arg()->Clear(); + for (const auto& kv : args) { + 
c2_op->add_arg()->CopyFrom(*kv.second); + } + + return ret; +} + +// Do the following: +// for a given index tensor (i.e. `starts` or `ends`): +// 1) Hilariously subtract 1 from the value if it is negative. This due to +// the behavior of Caffe2's slice operator not matching that of ONNX's slice +// 2) Fully expand the index tensor out to the rank of the data tensor. +// pseudocode: indices_full = zeros(rank); indices_full[axes] = indices.int() +std::string Caffe2Backend::PreprocessSliceIndexTensor(OnnxNode* onnx_node, + Caffe2Ops& ret, + std::string indices_tensor, + std::string axes_tensor, + std::string rank_tensor, + std::string zero_tensor, + std::string one_tensor, + int default_value) { + auto indices_tensor_full = dummy_->NewDummyName(); + + { + caffe2::Argument value; + value.set_name("value"); + value.set_i(default_value); + caffe2::Argument dtype; + dtype.set_name("dtype"); + dtype.set_i(static_cast(caffe2::TensorProto::INT64)); + caffe2::Argument input_as_shape; + input_as_shape.set_name("input_as_shape"); + input_as_shape.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "ConstantFill", {rank_tensor}, {indices_tensor_full}, + {value, dtype, input_as_shape}); + } + + // Subtract 1 from each element of the indices tensor that is negative + auto lt_tensor = dummy_->NewDummyName(); + { + caffe2::Argument broadcast; + broadcast.set_name("broadcast"); + broadcast.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "LT", {indices_tensor, zero_tensor}, {lt_tensor}, {broadcast}); + } + + auto sub_one_tensor = dummy_->NewDummyName(); + { + caffe2::Argument broadcast; + broadcast.set_name("broadcast"); + broadcast.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Sub", {indices_tensor, one_tensor}, {sub_one_tensor}, {broadcast}); + } + + auto indices_tensor_adjusted = dummy_->NewDummyName(); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Conditional", {lt_tensor, sub_one_tensor, indices_tensor}, {indices_tensor_adjusted}, {}); + + // Fill in values specified from the partially-specified ONNX indices tensor + c2_op = ret.ops.Add(); + BuildOperator(c2_op, "ScatterAssign", + {indices_tensor_full, axes_tensor, indices_tensor_adjusted}, + {indices_tensor_full}); + + return indices_tensor_full; +} + +Caffe2Ops Caffe2Backend::CreateDynamicSlice( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); + CAFFE_ENFORCE_EQ(op_tmp.ops.size(), 1); + auto* op = op_tmp.ops.Mutable(0); + std::unordered_map args; + for (auto& arg : *op->mutable_arg()) { + args.emplace(arg.name(), &arg); + } + + CAFFE_ENFORCE_GE(op->input_size(), 1); + auto data = op->input(0); + Caffe2Ops ret; + + // First get the shape of the input tensor + auto* c2_op = ret.ops.Add(); + auto size_tensor = dummy_->NewDummyName(); + BuildOperator(c2_op, "Shape", {data}, {size_tensor}); + + // Now get the rank of the tensor by getting the shape of the shape of + // the input tensor c2_op = ret.ops.Add(); - BuildOperator(c2_op, "Cast", {ends_tensor}, {casted_ends_tensor}, {to}); + auto rank_tensor = dummy_->NewDummyName(); + BuildOperator(c2_op, "Shape", {size_tensor}, {rank_tensor}); + + // Axes tensor will be used to populate the fully-specified starts and ends + // arguments to the caffe2 Slice operator. 
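The helper above is applied to both `starts` and `ends` further down. Its net effect on the index values, shifting negative entries by one to match Caffe2's Slice convention and scattering the partially-specified indices into a full-rank vector of defaults, can be reproduced in a standalone sketch (illustrative only):

```
#include <cassert>
#include <cstdint>
#include <vector>

// default_value is 0 for `starts` and -1 for `ends`, matching the
// ConstantFill defaults used below.
std::vector<int64_t> expand_slice_indices(
    const std::vector<int64_t>& indices,
    const std::vector<int64_t>& axes,
    int64_t rank,
    int64_t default_value) {
  std::vector<int64_t> full(rank, default_value);
  for (size_t i = 0; i < axes.size(); ++i) {
    int64_t v = indices[i];
    if (v < 0) {
      v -= 1;  // translate ONNX's negative-index convention to Caffe2's
    }
    full[axes[i]] = v;
  }
  return full;
}

int main() {
  // starts = [1], ends = [-1] on axis 1 of a rank-3 tensor:
  assert((expand_slice_indices({1}, {1}, 3, 0) ==
          std::vector<int64_t>{0, 1, 0}));
  assert((expand_slice_indices({-1}, {1}, 3, -1) ==
          std::vector<int64_t>{-1, -2, -1}));
  return 0;
}
```

Untouched axes keep the default value, and the ends default of -1 is already what Caffe2's Slice reads as "to the end of the dimension".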
+ std::string axes_tensor; + if (onnx_node->node.input_size() > 2) { + axes_tensor = onnx_node->node.input(3); + } else { + axes_tensor = dummy_->NewDummyName(); + auto* c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Range", {rank_tensor}, {axes_tensor}, {}); + } + + // Useful int tensors + auto define_integer_constant = [this, &ret](int val) { + caffe2::Argument value; + value.set_name("value"); + value.set_i(val); + caffe2::Argument dtype; + dtype.set_name("dtype"); + dtype.set_i(static_cast(caffe2::TensorProto::INT64)); + caffe2::Argument shape; + shape.set_name("shape"); + shape.add_ints(1); + auto c2_op = ret.ops.Add(); + auto name = dummy_->NewDummyName(); + BuildOperator(c2_op, "ConstantFill", {}, {name}, + {value, dtype, shape}); + return name; + }; + + auto zero_tensor = define_integer_constant(0); + auto one_tensor = define_integer_constant(1); + + auto starts_tensor_full = PreprocessSliceIndexTensor(onnx_node, + ret, + onnx_node->node.input(1), // starts + axes_tensor, + rank_tensor, + zero_tensor, + one_tensor, + 0); + + auto ends_tensor_full = PreprocessSliceIndexTensor(onnx_node, + ret, + onnx_node->node.input(2), // ends + axes_tensor, + rank_tensor, + zero_tensor, + one_tensor, + -1); // attach the original op at the end c2_op = ret.ops.Add(); c2_op->CopyFrom(*op); c2_op->mutable_input()->Clear(); c2_op->add_input(data); - c2_op->add_input(casted_starts_tensor); - c2_op->add_input(casted_ends_tensor); + c2_op->add_input(starts_tensor_full); + c2_op->add_input(ends_tensor_full); c2_op->mutable_arg()->Clear(); for (const auto& kv : args) { c2_op->add_arg()->CopyFrom(*kv.second); diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index eab0e2f7e1f131..2b74dec1e3ccea 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -11,7 +11,7 @@ #include #include -constexpr int kKnownOpsetVersion = 7; +constexpr int kKnownOpsetVersion = 9; namespace caffe2 { namespace onnx { @@ -212,6 +212,17 @@ class CAFFE2_API Caffe2Backend { Caffe2Ops CreateSlice(OnnxNode* onnx_node, const ConversionContext& ctx); + std::string PreprocessSliceIndexTensor(OnnxNode* onnx_node, + Caffe2Ops& ret, + std::string indices_tensor, + std::string axes_tensor, + std::string rank_tensor, + std::string zero_tensor, + std::string one_tensor, + int default_value); + + Caffe2Ops CreateDynamicSlice(OnnxNode* onnx_node, const ConversionContext& ctx); + Caffe2Ops CreateSplit(OnnxNode* onnx_node, const ConversionContext& ctx); Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, const ConversionContext& ctx); diff --git a/caffe2/operators/copy_op.cc b/caffe2/operators/copy_op.cc new file mode 100644 index 00000000000000..582e31475780bf --- /dev/null +++ b/caffe2/operators/copy_op.cc @@ -0,0 +1,198 @@ +#include "caffe2/operators/copy_op.h" + +namespace caffe2 { + +// From CPU, copy it to whatever the current context +REGISTER_CPU_OPERATOR( + CopyFromCPUInput, + CopyOp); +REGISTER_CPU_OPERATOR( + CopyOnDeviceLike, + CopyOnDeviceLikeOp); +REGISTER_CPU_OPERATOR(Copy, CopyOp); + +OPERATOR_SCHEMA(Copy) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .SetDoc(R"DOC( +Copy input tensor into output, potentially across devices. + +Github Links: + +- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.cc +- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.h + + +
+ + Example + +**Code** + +``` + +workspace.ResetWorkspace() + +op = core.CreateOperator( + "Copy", + ["input"], + ["output"] +) + +workspace.FeedBlob("input", np.random.rand(3,3)) +print("input:", workspace.FetchBlob("input")) +workspace.RunOperatorOnce(op) +print("output:", workspace.FetchBlob("output")) + +``` + +**Result** + +``` + +input: +[[0.16826761 0.68168217 0.55196001] + [0.19735483 0.34837823 0.69015595] + [0.09448514 0.57390828 0.37097193]] +output: +[[0.16826761 0.68168217 0.55196001] + [0.19735483 0.34837823 0.69015595] + [0.09448514 0.57390828 0.37097193]] + +``` + +
+ +)DOC") + .Input(0, "input", "(*Tensor*): input tensor to copy") + .Output(0, "output", "(*Tensor*): copy of input tensor"); + +OPERATOR_SCHEMA(CopyGPUToCPU) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + CAFFE_ENFORCE( + def.has_device_option(), + "CopyGPUToCPU op should have cuda device option."); + auto& cuda_option = def.device_option(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cuda_option); + vector out_dev(def.output_size(), cpu_option); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Copy tensor for GPU to CPU context. Must be run under GPU device option. +)DOC") + .Input(0, "input", "The input tensor.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +OPERATOR_SCHEMA(CopyCPUToGPU) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + CAFFE_ENFORCE( + def.has_device_option(), + "CopyCPUToGPU op should have cuda device option."); + auto& cuda_option = def.device_option(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cpu_option); + vector out_dev(def.output_size(), cuda_option); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Copy tensor for CPU to GPU context. Must be run under GPU device option. +)DOC") + .Input(0, "input", "The input tensor.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +OPERATOR_SCHEMA(CopyFromCPUInput) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + auto op_device = + def.has_device_option() ? def.device_option() : DeviceOption(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cpu_option); + vector out_dev(def.output_size(), op_device); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Take a CPU input tensor and copy it to an output in the current +Context (GPU or CPU). This may involves cross-device MemCpy. 
+)DOC") + .Input(0, "input", "The input CPU tensor.") + .Output(0, "output", "either a TensorCUDA or a TensorCPU"); + +OPERATOR_SCHEMA(CopyOnDeviceLike) + .NumInputs(2) + .NumOutputs(1) + .SetDoc("Copy input tensor into output to the specific device.") + .Input(0, "input", "The input tensor.") + .Input(1, "dst", "Tensor, on which device the copy will be performed.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +struct GetCopyGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + return SingleGradientDef( + "CopyOnDeviceLike", + "", + vector{GO(0), I(0)}, + vector{GI(0)}); + } +}; +REGISTER_GRADIENT(Copy, GetCopyGradient); + +struct GetGPUToCPUGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + if (g_output_[0].IsDense()) { + return SingleGradientDef( + "CopyCPUToGPU", "", vector{GO(0)}, vector{GI(0)}); + } else { + return vector{CreateOperatorDef( + "CopyCPUToGPU", + "", + std::vector{GO_I(0)}, + std::vector{GI_I(0)}), + CreateOperatorDef( + "CopyCPUToGPU", + "", + std::vector{GO_V(0)}, + std::vector{GI_V(0)})}; + } + } +}; +REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient); + +struct GetCPUToGPUGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + if (g_output_[0].IsDense()) { + return SingleGradientDef( + "CopyGPUToCPU", "", vector{GO(0)}, vector{GI(0)}); + } else { + return vector{CreateOperatorDef( + "CopyGPUToCPU", + "", + std::vector{GO_I(0)}, + std::vector{GI_I(0)}), + CreateOperatorDef( + "CopyGPUToCPU", + "", + std::vector{GO_V(0)}, + std::vector{GI_V(0)})}; + } + } +}; +REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/copy_op.cu b/caffe2/operators/copy_op.cu new file mode 100644 index 00000000000000..e833e720e556f3 --- /dev/null +++ b/caffe2/operators/copy_op.cu @@ -0,0 +1,48 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/copy_op.h" + +namespace caffe2 { + +template <> +class CopyOnDeviceLikeOp + : public Operator { + public: + CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + USE_OPERATOR_FUNCTIONS(CUDAContext); + + bool RunOnDevice() override { + auto& input = Input(0); + auto* output = OperatorBase::Output(0, CUDA); + CUDAContext context(GetGPUIDForPointer(Input(1).raw_data())); + output->ResizeLike(input); + context.template CopyItems( + input.meta(), + input.size(), + input.raw_data(), + output->raw_mutable_data(input.meta())); + return true; + } +}; + +// From CPU, copy it to whatever the current context +REGISTER_CUDA_OPERATOR( + CopyFromCPUInput, + CopyOp); + +// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context, +// since gpu code will be involved. +REGISTER_CUDA_OPERATOR( + CopyGPUToCPU, + CopyOp); +REGISTER_CUDA_OPERATOR( + CopyCPUToGPU, + CopyOp); +// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe +// involving different GPUs. 
+REGISTER_CUDA_OPERATOR(Copy, CopyOp); + +REGISTER_CUDA_OPERATOR( + CopyOnDeviceLike, + CopyOnDeviceLikeOp); +} // namespace caffe2 diff --git a/caffe2/operators/copy_op.h b/caffe2/operators/copy_op.h new file mode 100644 index 00000000000000..11e8e15fbcf005 --- /dev/null +++ b/caffe2/operators/copy_op.h @@ -0,0 +1,38 @@ +#ifndef CAFFE2_OPERATORS_COPY_OP_H_ +#define CAFFE2_OPERATORS_COPY_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template +class CopyOp : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + USE_SIMPLE_CTOR_DTOR(CopyOp) + + bool RunOnDevice() override { + auto& input = this->template Input(0, SrcContext::GetDeviceType()); + auto* output = + this->template Output(0, DstContext::GetDeviceType()); + output->ResizeLike(input); + this->context_.template CopyItems( + input.meta(), + input.size(), + input.raw_data(), + output->raw_mutable_data(input.meta())); + return true; + } +}; + +template +class CopyOnDeviceLikeOp : public CopyOp { + public: + CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : CopyOp(operator_def, ws) {} +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_COPY_OP_H_ diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu index 7030e846b714e2..df7a124d29711f 100644 --- a/caffe2/operators/cross_entropy_op.cu +++ b/caffe2/operators/cross_entropy_op.cu @@ -13,7 +13,7 @@ __global__ void LabelCrossEntropyKernel( const float log_threshold, float* Ydata) { CUDA_1D_KERNEL_LOOP(i, N) { CUDA_KERNEL_ASSERT(labeldata[i] >= 0 && labeldata[i] < D); - Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold)); + Ydata[i] = -logf(fmaxf(Xdata[i * D + labeldata[i]], log_threshold)); } } __global__ void LabelCrossEntropyGradientKernel( @@ -21,7 +21,7 @@ __global__ void LabelCrossEntropyGradientKernel( const float* dYdata, const float log_threshold, float* dXdata) { CUDA_1D_KERNEL_LOOP(i, N) { int idx = i * D + labeldata[i]; - dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold); + dXdata[idx] = - dYdata[i] / fmaxf(Xdata[idx], log_threshold); } } } // namespace diff --git a/caffe2/operators/ensure_cpu_output_op.h b/caffe2/operators/ensure_cpu_output_op.h index 08207644f7f094..3b8cb439f12066 100644 --- a/caffe2/operators/ensure_cpu_output_op.h +++ b/caffe2/operators/ensure_cpu_output_op.h @@ -15,9 +15,9 @@ class EnsureCPUOutputOp : public Operator { : Operator(operator_def, ws) {} bool RunOnDevice() override { - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { return CopyWithContext(); - } else if (this->template InputIsType(0, Context::GetDeviceType())) { + } else if (this->InputIsTensorType(0, Context::GetDeviceType())) { // CUDA Context will go this branch return CopyWithContext(); } else { diff --git a/caffe2/operators/half_float_ops.h b/caffe2/operators/half_float_ops.h index e6698a0b7283d1..b8d5dacf69472b 100644 --- a/caffe2/operators/half_float_ops.h +++ b/caffe2/operators/half_float_ops.h @@ -76,7 +76,7 @@ inline std::vector Float16FillerTensorInference( vector out(1); ArgumentHelper helper(def); out[0].set_data_type(static_cast( - helper.GetSingleArgument("dtype", TensorProto_DataType_FLOAT))); + helper.GetSingleArgument("dtype", TensorProto_DataType_FLOAT16))); auto shape = helper.GetRepeatedArgument("shape"); for (int d : shape) { out[0].add_dims(d); diff --git a/caffe2/operators/if_op.h b/caffe2/operators/if_op.h index cff2a620ef4694..e76fea0ee8f21a 100644 --- a/caffe2/operators/if_op.h +++ 
b/caffe2/operators/if_op.h @@ -32,7 +32,7 @@ class IfOp final : public Operator { bool RunOnDevice() override { CAFFE_ENFORCE( - this->template InputIsType(0, Context::GetDeviceType()), + this->InputIsTensorType(0, Context::GetDeviceType()), "Invalid condition in If operator: tensor expected"); const auto& condition = Input(0); diff --git a/caffe2/operators/length_split_op.cc b/caffe2/operators/length_split_op.cc new file mode 100644 index 00000000000000..7c342d154491b1 --- /dev/null +++ b/caffe2/operators/length_split_op.cc @@ -0,0 +1,37 @@ +#include "caffe2/operators/length_split_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(LengthsSplit, LengthsSplitOp); + +OPERATOR_SCHEMA(LengthsSplit) + .NumInputs(1, 2) + .NumOutputs(1) + .ScalarType(TensorProto::INT32) + .SetDoc(R"DOC( +Given input vector LENGTHS, and input n_split, LengthsSplit returns +a single output vector. It "splits" each length into n_split values which add +up to the original length. It will attempt to do equal splits, and if not possible, +it orders larger values first. If the n_split is larger than the length, zero +padding will be applied. + +e.g. LENGTHS = [9 4 5] + n_split = 3 + Y = [3 3 3 2 1 1 2 2 1] + +e.g. LENGTHS = [2, 1, 2] + n_split = 3 + Y = [1 1 0 1 0 0 1 1 0] +)DOC") + .Arg("n_split", "Number of splits for each element in LENGTHS") + .Input(0, "LENGTHS", "Mx1 Input tensor denoting INT32 lengths") + .Input( + 1, + "n_split", + "(Optional) Number of splits for each element in LENGTHS (overrides argument)") + .Output(0, "Y", "(M*n_split)x1 Output vector denoting split lengths"); + +// TODO: Write gradient for this when needed +GRADIENT_NOT_IMPLEMENTED_YET(LengthsSplit); + +} // namespace caffe2 diff --git a/caffe2/operators/length_split_op.h b/caffe2/operators/length_split_op.h new file mode 100644 index 00000000000000..d8c98bf085c8a2 --- /dev/null +++ b/caffe2/operators/length_split_op.h @@ -0,0 +1,75 @@ +#ifndef CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ +#define CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ + +#include "caffe2/core/common_omp.h" +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class LengthsSplitOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + LengthsSplitOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + n_split_(OperatorBase::GetSingleArgument("n_split", 0)) { + if (InputSize() == 1) { + // If not specified, then must have this argument + CAFFE_ENFORCE( + OperatorBase::HasArgument("n_split"), + "Argument `n_split` is missing and was not specified as input."); + CAFFE_ENFORCE( + n_split_ > 0, + "`n_split` must contain a positive value for defined behavior."); + } + } + ~LengthsSplitOp() {} + + bool RunOnDevice() override { + const auto& L = Input(0); + CAFFE_ENFORCE_EQ(L.ndim(), 1, "Input `LENGTHS` should be a 1D vector."); + + if (InputSize() > 1) { + // We potentially have n_split specified as inputs as well + CAFFE_ENFORCE( + Input(1).ndim() == 1 && Input(1).size() == 1, + "Input `n_split` should be a vector of size 1."); + + const auto& input1 = Input(1); + context_.template CopyItems( + input1.meta(), 1, input1.raw_data(), &n_split_); + } + + CAFFE_ENFORCE( + n_split_ > 0, + "`n_split` must contain a positive value for defined behavior."); + const auto M = L.size(); + + auto* Y = Output(0); + Y->Resize(M * n_split_); + + const int32_t* Ldata = L.template data(); + int32_t* Ydata = Y->template mutable_data(); + + for 
(int i = 0; i < M; i++) { + int32_t mod = Ldata[i] % n_split_; + int32_t res = + mod != 0 ? math::divUp(Ldata[i], n_split_) : Ldata[i] / n_split_ + 1; + for (int j = 0; j < n_split_; j++) { + Ydata[(i * n_split_) + j] = mod-- > 0 ? res : res - 1; + } + } + return true; + } + + private: + int32_t n_split_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ diff --git a/caffe2/operators/lengths_tile_op.cc b/caffe2/operators/lengths_tile_op.cc index e832fe8723a0ce..d5af0a91bd65c0 100644 --- a/caffe2/operators/lengths_tile_op.cc +++ b/caffe2/operators/lengths_tile_op.cc @@ -1,6 +1,50 @@ #include "caffe2/operators/lengths_tile_op.h" namespace caffe2 { + +template <> +bool LengthsTileOp::RunOnDevice() { + auto& data = Input(DATA); + auto& lengths = Input(LENGTHS); + auto* output = Output(0); + + CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); + CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); + CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); + + // Context::CopyFrom and math::Sum need the same context to avoid race + // conditions + // why? CPUContext is not used in Sum + lengths_host_.CopyFrom(lengths, &context_); + context_.FinishDeviceComputation(); + auto lengths_size = lengths_host_.size(); + auto* lengths_data = lengths_host_.data(); + + int32_t total_length = 0; + CPUContext cpuContext; + math::Sum( + lengths_size, lengths_data, &total_length, &cpuContext); + + auto shape = data.dims(); + shape[0] = total_length; + output->Resize(shape); + + auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); + auto src = static_cast(data.raw_data()); + auto out = static_cast(output->raw_mutable_data(data.meta())); + + for (TIndex i = 0; i < lengths_size; ++i) { + auto length = lengths_data[i]; + CAFFE_ENFORCE_GE(length, 0); + for (int32_t j = 0; j < length; ++j) { + context_.CopyBytesSameDevice(block_bytesize, src, out); + out += block_bytesize; + } + src += block_bytesize; + } + return true; +} + REGISTER_CPU_OPERATOR(LengthsTile, LengthsTileOp); OPERATOR_SCHEMA(LengthsTile) diff --git a/caffe2/operators/lengths_tile_op.cu b/caffe2/operators/lengths_tile_op.cu new file mode 100644 index 00000000000000..aebb33c1460a56 --- /dev/null +++ b/caffe2/operators/lengths_tile_op.cu @@ -0,0 +1,110 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/lengths_tile_op.h" + +namespace caffe2 { + +template +__global__ void lengthsTileKernel( + int numElements, + int rowSize, + const T* input, + T* output, + const int32_t* inputRowOffsets) { + CUDA_1D_KERNEL_LOOP(i, numElements) { + auto outputRowIndex = i / rowSize; + auto inputBlockOffset = inputRowOffsets[outputRowIndex]; + auto indexInRow = i - outputRowIndex * rowSize; + output[i] = input[inputBlockOffset + indexInRow]; + } +} + +template <> +bool LengthsTileOp::RunOnDevice() { + auto& data = Input(DATA); + auto& lengths = Input(LENGTHS); + auto* output = Output(0); + + CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); + CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); + CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); + + lengths_host_.CopyFrom(lengths, &context_); + context_.FinishDeviceComputation(); + auto lengths_size = lengths_host_.size(); + auto* lengths_data = lengths_host_.data(); + + int32_t total_length = 0; + CPUContext cpuContext; + math::Sum( + lengths_size, lengths_data, &total_length, &cpuContext); + + auto shape = data.dims(); + shape[0] = total_length; + output->Resize(shape); + + auto numElementsPerRow = data.size_from_dim(1); + auto 
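As a sanity check on the LengthsSplit schema above, here is a small pure-Python sketch of the splitting rule the loop implements (larger pieces first, zero padding when `n_split` exceeds the length); it reproduces both worked examples from the operator doc:

```
def lengths_split(lengths, n_split):
    # Each length L becomes n_split pieces summing to L; the first L % n_split
    # pieces get the extra unit, mirroring the mod-- logic in the C++ loop.
    out = []
    for length in lengths:
        base, mod = divmod(length, n_split)
        out.extend([base + 1] * mod + [base] * (n_split - mod))
    return out

assert lengths_split([9, 4, 5], 3) == [3, 3, 3, 2, 1, 1, 2, 2, 1]
assert lengths_split([2, 1, 2], 3) == [1, 1, 0, 1, 0, 0, 1, 1, 0]
```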
numElements = total_length * numElementsPerRow; + auto numBlocks = CAFFE_GET_BLOCKS(numElements); + + rowMappingHost_.Resize(total_length); + rowMappingDevice_.Resize(total_length); + auto* rowOffsets = rowMappingHost_.mutable_data(); + int32_t outputRow = 0; + for (TIndex i = 0; i < lengths_size; i++) { + auto length = lengths_data[i]; + for (int32_t j = 0; j < length; j++) { + rowOffsets[outputRow++] = i * numElementsPerRow; + } + } + + context_.CopyFromCPU( + total_length, + rowMappingHost_.data(), + rowMappingDevice_.mutable_data()); + context_.FinishDeviceComputation(); + + if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else { + CAFFE_THROW( + "LengthsTile operator only supports 32-bit float, int and int64_t" + " types but input was of type ", + data.meta().name()); + } + return true; +} + +REGISTER_CUDA_OPERATOR(LengthsTile, LengthsTileOp); + +} // namespace caffe2 diff --git a/caffe2/operators/lengths_tile_op.h b/caffe2/operators/lengths_tile_op.h index c92adcb956d916..c9bd5ef500d996 100644 --- a/caffe2/operators/lengths_tile_op.h +++ b/caffe2/operators/lengths_tile_op.h @@ -13,44 +13,6 @@ class LengthsTileOp : public Operator { USE_SIMPLE_CTOR_DTOR(LengthsTileOp); bool RunOnDevice() override { - auto& data = Input(DATA); - auto& lengths = Input(LENGTHS); - auto* output = Output(0); - - CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); - CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); - CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); - - // Context::CopyFrom and math::Sum need the same context to avoid race - // conditions - // why? 
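The CUDA path above precomputes, on the host, one input-row offset per output row and then gathers with a flat kernel. Functionally LengthsTile is just a per-row repeat; a NumPy sketch of the expected output:

```
import numpy as np

def lengths_tile(data, lengths):
    # Row i of DATA is emitted lengths[i] times, which is exactly what the
    # rowMapping offsets fed to lengthsTileKernel encode.
    assert data.shape[0] == len(lengths)
    return np.repeat(data, lengths, axis=0)

data = np.arange(6, dtype=np.float32).reshape(3, 2)
print(lengths_tile(data, [2, 0, 3]))  # row 0 twice, row 1 dropped, row 2 three times
```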
CPUContext is not used in Sum - lengths_host_.CopyFrom(lengths, &context_); - context_.FinishDeviceComputation(); - auto lengths_size = lengths_host_.size(); - auto* lengths_data = lengths_host_.data(); - - int32_t total_length = 0; - CPUContext cpuContext; - math::Sum( - lengths_size, lengths_data, &total_length, &cpuContext); - - auto shape = data.dims(); - shape[0] = total_length; - output->Resize(shape); - - auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); - auto src = static_cast(data.raw_data()); - auto out = static_cast(output->raw_mutable_data(data.meta())); - - for (TIndex i = 0; i < lengths_size; ++i) { - auto length = lengths_data[i]; - CAFFE_ENFORCE_GE(length, 0); - for (int32_t j = 0; j < length; ++j) { - context_.CopyBytesSameDevice(block_bytesize, src, out); - out += block_bytesize; - } - src += block_bytesize; - } return true; } @@ -58,6 +20,8 @@ class LengthsTileOp : public Operator { private: Tensor lengths_host_{CPU}; + Tensor rowMappingHost_{CPU}; + Tensor rowMappingDevice_{Context::GetDeviceType()}; }; } // namespace caffe2 diff --git a/caffe2/operators/lengths_tile_op_gpu.cc b/caffe2/operators/lengths_tile_op_gpu.cc deleted file mode 100644 index 65ed44b735de43..00000000000000 --- a/caffe2/operators/lengths_tile_op_gpu.cc +++ /dev/null @@ -1,6 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/lengths_tile_op.h" - -namespace caffe2 { -REGISTER_CUDA_OPERATOR(LengthsTile, LengthsTileOp); -} // namespace caffe2 diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 4e9f6f2ac280f1..aee7fff4bc3391 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -15,7 +15,7 @@ void BlobToTensorDescriptor( // Memory type // We only allow weights to be CPU tensor for now CAFFE_ENFORCE( - blob->template IsType(CPU), + blob->IsTensorType(CPU), "Initialization blob ", name, " needs to be TensorCPU"); diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index b0ee9611a69042..8ef39e7c0e78d1 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -64,7 +64,7 @@ class GPUFallbackOpEx final : public Operator { bool RunOnDevice() override { bool need_sync = false; for (int i = 0; i < InputSize(); ++i) { - if (this->template InputIsType(i, CUDA)) { + if (this->InputIsTensorType(i, CUDA)) { local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom( Input(i), &context_); need_sync = true; @@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "GPU fallback op currently does not support non-TensorCPU " "output type who needs copying."); Output(i)->CopyFrom(local_output_blobs_[i]->template Get()); diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu index 27a71a69141098..f63a7d87fa88fe 100644 --- a/caffe2/operators/order_switch_ops.cu +++ b/caffe2/operators/order_switch_ops.cu @@ -1,91 +1,168 @@ #include "caffe2/operators/order_switch_ops.h" + #include "caffe2/core/context_gpu.h" +#include "caffe2/utils/fixed_divisor.h" namespace caffe2 { -__global__ void NHWC2NCHWKernel( - const int N, - const int HW, +template +__global__ void NHWC2NCHWCUDAKernel( + const int size, +#ifndef __HIPCC__ + const FixedDivisor C, + const FixedDivisor HxW, +#else const int C, - const float* X, - float* Y) { - CUDA_1D_KERNEL_LOOP(i, N * HW * C) { - const int c = i % C; - 
const int hw = i / C % HW; - const int n = i / C / HW; - Y[(n * C + c) * HW + hw] = X[i]; + const int HxW, +#endif + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, size) { + int n; + int c; + int hxw; + + int c_d; + int hxw_d; +#ifndef __HIPCC__ + HxW.DivMod(i, &c, &hxw); + C.DivMod(c, &n, &c); + + c_d = C.d(); + hxw_d = HxW.d(); +#else + c = i / HxW; + hxw = i % HxW; + n = c / C; + c = c % C; + + c_d = C; + hxw_d = HxW; +#endif + +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + (n * hxw_d + hxw) * c_d + c); +#else + Y[i] = X[(n * hxw_d + hxw) * c_d + c]; +#endif } } -__global__ void NCHW2NHWCKernel( - const int N, +template +__global__ void NCHW2NHWCCUDAKernel( + const int size, +#ifndef __HIPCC__ + const FixedDivisor C, + const FixedDivisor HxW, +#else const int C, - const int HW, - const float* X, - float* Y) { - CUDA_1D_KERNEL_LOOP(i, N * C * HW) { - const int hw = i % HW; - const int c = i / HW % C; - const int n = i / C / HW; - Y[(n * HW + hw) * C + c] = X[i]; + const int HxW, +#endif + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, size) { + int n; + int c; + int hxw; + + int c_d; + int hxw_d; +#ifndef __HIPCC__ + C.DivMod(i, &hxw, &c); + HxW.DivMod(hxw, &n, &hxw); + + c_d = C.d(); + hxw_d = HxW.d(); +#else + hxw = i / C; + c = i % C; + n = hxw / HxW; + hxw = hxw % HxW; + + c_d = C; + hxw_d = HxW; +#endif +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + (n * c_d + c) * hxw_d + hxw); +#else + Y[i] = X[(n * c_d + c) * hxw_d + hxw]; +#endif } } template <> bool NHWC2NCHWOp::RunOnDevice() { - auto& X = Input(0); + const auto& X = Input(0); auto* Y = Output(0); - - auto ndim = X.ndim(); - DCHECK_GE(ndim, 3); - const int N = X.dim32(0), C = X.dim32(ndim - 1); + const int ndim = X.ndim(); + CAFFE_ENFORCE_GE(ndim, 3); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); vector Y_dims(ndim); Y_dims[0] = N; Y_dims[1] = C; - size_t image_size = 1; - for (auto i = 2; i < ndim; ++i) { + int HxW = 1; + for (int i = 2; i < ndim; ++i) { Y_dims[i] = X.dim32(i - 1); - image_size *= Y_dims[i]; + HxW *= Y_dims[i]; } Y->Resize(Y_dims); - - NHWC2NCHWKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N, image_size, C, X.data(), Y->template mutable_data()); + const int size = X.size(); + NHWC2NCHWCUDAKernel + <<>>( + size, +#ifndef __HIPCC__ + FixedDivisor(C), + FixedDivisor(HxW), +#else + C, + HxW, +#endif + X.data(), + Y->template mutable_data()); return true; } template <> bool NCHW2NHWCOp::RunOnDevice() { - auto& X = Input(0); + const auto& X = Input(0); auto* Y = Output(0); - - auto ndim = X.ndim(); - DCHECK_GE(X.ndim(), 3); - const int N = X.dim32(0), C = X.dim32(1); + const int ndim = X.ndim(); + CAFFE_ENFORCE_GE(X.ndim(), 3); + const int N = X.dim32(0); + const int C = X.dim32(1); vector Y_dims(ndim); Y_dims[0] = N; - size_t image_size = 1; + int HxW = 1; for (auto i = 1; i < ndim - 1; ++i) { Y_dims[i] = X.dim32(i + 1); - image_size *= Y_dims[i]; + HxW *= Y_dims[i]; } Y_dims[ndim - 1] = C; Y->Resize(Y_dims); - - NCHW2NHWCKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N, C, image_size, X.data(), Y->template mutable_data()); + const int size = X.size(); + NCHW2NHWCCUDAKernel + <<>>( + size, +#ifndef __HIPCC__ + FixedDivisor(C), + FixedDivisor(HxW), +#else + C, + HxW, +#endif + X.data(), + Y->template mutable_data()); return true; } - REGISTER_CUDA_OPERATOR(NHWC2NCHW, NHWC2NCHWOp); REGISTER_CUDA_OPERATOR(NCHW2NHWC, NCHW2NHWCOp); -} // namespace caffe2 + +} // namespace caffe2 diff --git 
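The rewritten order-switch kernels above decompose each output index with FixedDivisor instead of raw `/` and `%`, but the index mapping itself is unchanged. A NumPy sketch of the NHWC2NCHW index math, checked against a plain transpose:

```
import numpy as np

def nhwc2nchw_indexing(X_nhwc):
    # Mirror the kernel: split output index i into (n, c, hxw), then read the
    # NHWC buffer at (n * HxW + hxw) * C + c.
    N, C = X_nhwc.shape[0], X_nhwc.shape[-1]
    HxW = int(np.prod(X_nhwc.shape[1:-1]))
    x = X_nhwc.reshape(-1)
    y = np.empty(N * C * HxW, dtype=X_nhwc.dtype)
    for i in range(y.size):
        c, hxw = divmod(i, HxW)   # FixedDivisor for HxW
        n, c = divmod(c, C)       # FixedDivisor for C
        y[i] = x[(n * HxW + hxw) * C + c]
    return y.reshape((N, C) + X_nhwc.shape[1:-1])

X = np.random.rand(2, 3, 4, 5).astype(np.float32)            # NHWC input
assert np.array_equal(nhwc2nchw_indexing(X), X.transpose(0, 3, 1, 2))
```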
a/caffe2/operators/order_switch_ops_cudnn.cc b/caffe2/operators/order_switch_ops_cudnn.cc new file mode 100644 index 00000000000000..4cb0034e7ee60d --- /dev/null +++ b/caffe2/operators/order_switch_ops_cudnn.cc @@ -0,0 +1,160 @@ +#include "caffe2/operators/order_switch_ops.h" + +#include +#include +#include + +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/cudnn_wrappers.h" +#include "caffe2/core/types.h" + +namespace caffe2 { + +namespace { + +class CuDNNOrderSwithOpBase : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + + CuDNNOrderSwithOpBase(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), cudnn_wrapper_(&context_) { + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_)); + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_)); + } + + virtual ~CuDNNOrderSwithOpBase() { + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_)); + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_)); + } + + protected: + void SetTensorDescriptor( + const cudnnDataType_t data_type, + const StorageOrder order, + const std::vector& data_dims, + cudnnTensorDescriptor_t data_desc) const { + const int ndim = data_dims.size(); + const int N = data_dims[0]; + const int C = order == StorageOrder::NCHW ? data_dims[1] : data_dims.back(); + if (ndim == 3) { + const int H = 1; + const int W = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + data_desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + } else if (ndim == 4) { + const int H = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + const int W = order == StorageOrder::NCHW ? data_dims[3] : data_dims[2]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + data_desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + } else { + const int H = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + const int W = order == StorageOrder::NCHW ? data_dims[3] : data_dims[2]; + const auto l_iter = order == StorageOrder::NCHW ? data_dims.cbegin() + 4 + : data_dims.cbegin() + 3; + const auto r_iter = + order == StorageOrder::NCHW ? data_dims.cend() : data_dims.cend() - 1; + const int D = std::accumulate(l_iter, r_iter, 1, std::multiplies()); + const std::array dims = {N, C, H, W, D}; + const std::array strides = order == StorageOrder::NCHW + ? 
std::array{C * H * W * D, H * W * D, W * D, D, 1} + : std::array{C * H * W * D, 1, W * D * C, D * C, C}; + CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( + data_desc, data_type, 5, dims.data(), strides.data())); + } + } + + CuDNNWrapper cudnn_wrapper_; + cudnnTensorDescriptor_t X_desc_; + cudnnTensorDescriptor_t Y_desc_; + + std::vector cached_X_dims_; +}; + +class CuDNNNHWC2NCHWOp final : public CuDNNOrderSwithOpBase { + public: + CuDNNNHWC2NCHWOp(const OperatorDef& operator_def, Workspace* ws) + : CuDNNOrderSwithOpBase(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); + const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); + std::vector Y_dims(ndim); + Y_dims[0] = N; + Y_dims[1] = C; + std::copy(X_dims.cbegin() + 1, X_dims.cend() - 1, Y_dims.begin() + 2); + Y->Resize(Y_dims); + if (cached_X_dims_ != X_dims) { + cached_X_dims_ = X_dims; + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NHWC, X_dims, X_desc_); + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NCHW, Y_dims, Y_desc_); + } + CUDNN_ENFORCE(cudnnTransformTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + X_desc_, + X.template data(), + cudnnTypeWrapper::kZero(), + Y_desc_, + Y->template mutable_data())); + return true; + } +}; + +class CuDNNNCHW2NHWCOp final : public CuDNNOrderSwithOpBase { + public: + CuDNNNCHW2NHWCOp(const OperatorDef& operator_def, Workspace* ws) + : CuDNNOrderSwithOpBase(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(1); + const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); + std::vector Y_dims(ndim); + Y_dims[0] = N; + Y_dims[ndim - 1] = C; + std::copy(X_dims.cbegin() + 2, X_dims.cend(), Y_dims.begin() + 1); + Y->Resize(Y_dims); + if (cached_X_dims_ != X_dims) { + cached_X_dims_ = X_dims; + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NCHW, X_dims, X_desc_); + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NHWC, Y_dims, Y_desc_); + } + CUDNN_ENFORCE(cudnnTransformTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + X_desc_, + X.template data(), + cudnnTypeWrapper::kZero(), + Y_desc_, + Y->template mutable_data())); + return true; + } +}; + +} // namespace + +REGISTER_CUDNN_OPERATOR(NHWC2NCHW, CuDNNNHWC2NCHWOp); +REGISTER_CUDNN_OPERATOR(NCHW2NHWC, CuDNNNCHW2NHWCOp); + +} // namespace caffe2 diff --git a/caffe2/operators/slice_op.cc b/caffe2/operators/slice_op.cc index 529394b9092555..93c039c965f448 100644 --- a/caffe2/operators/slice_op.cc +++ b/caffe2/operators/slice_op.cc @@ -3,8 +3,8 @@ namespace caffe2 { -REGISTER_CPU_OPERATOR(Slice, SliceOp); -REGISTER_CPU_OPERATOR(SliceGradient, SliceGradientOp); +REGISTER_CPU_OPERATOR(Slice, SliceOp); +REGISTER_CPU_OPERATOR(SliceGradient, SliceGradientOp); OPERATOR_SCHEMA(Slice) .NumInputs(1, 3) diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index e2523ad7cbf3fc..5de302814ba2aa 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -231,79 +231,133 @@ bool SliceImplGpu( } // namespace -template <> -bool SliceOp::RunOnDevice() { - auto* 
output = Output(0); - auto& data = Input(0); +template<> +class SliceOp : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + SliceOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), + statically_inited_(false) {} + + bool RunOnDevice() override { + if (InputSize() > 1) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } - if (InputSize() > 1) { - starts_host_.CopyFrom(Input(1)); - ends_host_.CopyFrom(Input(2)); - } else { - if (!statically_inited_) { - CAFFE_ENFORCE(HasArgument("starts")); - CAFFE_ENFORCE(HasArgument("ends")); - CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); - - starts_host_.Resize(starts_.size()); - ends_host_.Resize(ends_.size()); - - memcpy( - starts_host_.mutable_data(), - starts_.data(), - sizeof(int) * starts_.size()); - memcpy( - ends_host_.mutable_data(), - ends_.data(), - sizeof(int) * ends_.size()); - statically_inited_ = true; + template + bool DoRunWithType() { + auto* output = Output(0); + auto& data = Input(0); + + if (InputSize() > 1) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.mutable_data(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.mutable_data(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + statically_inited_ = true; + } } + + return SliceImplGpu( + output, data, starts_host_, ends_host_, &context_); } + private: + std::vector starts_; + std::vector ends_; + bool statically_inited_; + Tensor starts_host_{CPU}; + Tensor ends_host_{CPU}; - return SliceImplGpu( - output, data, starts_host_, ends_host_, &context_); -} +}; // class SliceOp -REGISTER_CUDA_OPERATOR(Slice, SliceOp); +REGISTER_CUDA_OPERATOR(Slice, SliceOp); template <> -bool SliceGradientOp::RunOnDevice() { - auto* gdata = Output(0); - auto& data = Input(0); +class SliceGradientOp : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), + statically_inited_(false) {} + + AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + + bool RunOnDevice() override { + if (InputSize() == 4) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } - if (InputSize() == 4) { - starts_host_.CopyFrom(Input(1)); - ends_host_.CopyFrom(Input(2)); + template + bool DoRunWithType() { + auto* gdata = Output(0); + auto& data = Input(0); - auto& go = Input(3); + if (InputSize() == 4) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); - return SliceImplGpu( - nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); - } else { - if (!statically_inited_) { - CAFFE_ENFORCE(HasArgument("starts")); - CAFFE_ENFORCE(HasArgument("ends")); - CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); - - starts_host_.Resize(starts_.size()); - ends_host_.Resize(ends_.size()); - - memcpy( - starts_host_.mutable_data(), - starts_.data(), - sizeof(int) * starts_.size()); - memcpy( - ends_host_.mutable_data(), - ends_.data(), - sizeof(int) * 
ends_.size()); - - statically_inited_ = true; - } - auto& go = Input(1); + auto& go = Input(3); - return SliceImplGpu( - nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + return SliceImplGpu( + nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.mutable_data(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.mutable_data(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + + statically_inited_ = true; + } + auto& go = Input(1); + + return SliceImplGpu( + nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + } } -} -REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp); + private: + + std::vector starts_; + std::vector ends_; + bool statically_inited_; + Tensor starts_host_{CPU}; + Tensor ends_host_{CPU}; +}; // class SliceGradientOp +REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp); } // namespace caffe2 diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index 8f291affb8e8d0..aa8d4e50f0f9d9 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -198,22 +198,26 @@ bool SliceImpl( } // namespace -template +template class SliceOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; SliceOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} bool RunOnDevice() override { - return RunOnDeviceImpl(Input(0), Output(0)); + if (InputSize() > 1) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } } - protected: - bool RunOnDeviceImpl(const Tensor& data, Tensor* output) { + template + bool DoRunWithType() { if (InputSize() > 1) { starts_host_.CopyFrom(Input(1)); ends_host_.CopyFrom(Input(2)); @@ -238,31 +242,45 @@ class SliceOp : public Operator { } } + auto data = Input(0); + auto output = Output(0); + return SliceImpl( output, data, starts_host_, ends_host_, &context_); } AT_DISABLE_COPY_AND_ASSIGN(SliceOp); - private: - std::vector starts_; - std::vector ends_; + protected: + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; }; -template +template class SliceGradientOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} + AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + bool RunOnDevice() override { + if (InputSize() == 4) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } + + template + bool DoRunWithType() { auto* gdata = Output(0); auto& data = Input(0); @@ -301,11 +319,10 @@ class SliceGradientOp : public Operator { } } - AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); - private: - std::vector starts_; - std::vector ends_; + + std::vector 
starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc index ece70ffd2425e1..c9ba13efb50258 100644 --- a/caffe2/operators/string_ops_test.cc +++ b/caffe2/operators/string_ops_test.cc @@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test { const std::string* checkAndGetOutput(int outputSize) { const auto* output = ws_.GetBlob("Y"); EXPECT_NE(output, nullptr); - EXPECT_TRUE(output->IsType(CPU)); + EXPECT_TRUE(output->IsTensorType(CPU)); const auto& outputTensor = output->Get(); EXPECT_EQ(outputTensor.ndim(), 1); EXPECT_EQ(outputTensor.dim(0), outputSize); diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc index 8f1e0895a28596..a6d395fe9ba647 100644 --- a/caffe2/operators/stylizer_ops.cc +++ b/caffe2/operators/stylizer_ops.cc @@ -82,7 +82,7 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp auto defaultNoiseSize = OperatorBase::GetSingleArgument( "noise_size", 491 /* prime to avoid artifacts */); - if (!noiseBlob->IsType(CPU)) { + if (!noiseBlob->IsTensorType(CPU)) { // Initialize random noise on first use. // Cache it to maintain temporal consistency. auto* t = noiseBlob->GetMutableTensor(CPU); diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 6c287c37f3d6c9..b370b5ecb438cc 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -52,14 +52,6 @@ REGISTER_CPU_OPERATOR( ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); -// From CPU, copy it to whatever the current context -REGISTER_CPU_OPERATOR( - CopyFromCPUInput, - CopyOp); -REGISTER_CPU_OPERATOR( - CopyOnDeviceLike, - CopyOnDeviceLikeOp); -REGISTER_CPU_OPERATOR(Copy, CopyOp); REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp); REGISTER_CPU_OPERATOR(HasElements, HasElementsOp); REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp); @@ -379,133 +371,6 @@ Currently only works on CPU because of access to INDICES. "Update slices, with shape len(INDICES) + shape(X_0)[1:]") .Output(0, "DATA", "Has to be exactly the same tensor as the input 0"); -OPERATOR_SCHEMA(Copy) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .SetDoc(R"DOC( -Copy input tensor into output, potentially across devices. - -Github Links: - -- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/utility_ops.cc -- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/utility_ops.h - - -
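Returning to the Slice changes above: with the new SIndex template parameter, `starts`/`ends` supplied as input tensors may be either int32 or int64, and DispatchHelper picks the instantiation from Input(1). A minimal usage sketch through the Python frontend (assuming the same `core`/`workspace` API used by the operator-doc examples in this file; negative `ends` values count from the end of the dimension):

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
workspace.FeedBlob("X", np.arange(12, dtype=np.float32).reshape(3, 4))
# int64 starts/ends now dispatch to the SIndex = int64_t instantiation.
workspace.FeedBlob("starts", np.array([0, 1], dtype=np.int64))
workspace.FeedBlob("ends", np.array([-1, 3], dtype=np.int64))
workspace.RunOperatorOnce(core.CreateOperator("Slice", ["X", "starts", "ends"], ["Y"]))
print(workspace.FetchBlob("Y"))  # columns 1..2 of every row
```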
- - Example - -**Code** - -``` - -workspace.ResetWorkspace() - -op = core.CreateOperator( - "Copy", - ["input"], - ["output"] -) - -workspace.FeedBlob("input", np.random.rand(3,3)) -print("input:", workspace.FetchBlob("input")) -workspace.RunOperatorOnce(op) -print("output:", workspace.FetchBlob("output")) - -``` - -**Result** - -``` - -input: -[[0.16826761 0.68168217 0.55196001] - [0.19735483 0.34837823 0.69015595] - [0.09448514 0.57390828 0.37097193]] -output: -[[0.16826761 0.68168217 0.55196001] - [0.19735483 0.34837823 0.69015595] - [0.09448514 0.57390828 0.37097193]] - -``` - -
- -)DOC") - .Input(0, "input", "(*Tensor*): input tensor to copy") - .Output(0, "output", "(*Tensor*): copy of input tensor"); - -OPERATOR_SCHEMA(CopyGPUToCPU) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - CAFFE_ENFORCE( - def.has_device_option(), - "CopyGPUToCPU op should have cuda device option."); - auto& cuda_option = def.device_option(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cuda_option); - vector out_dev(def.output_size(), cpu_option); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Copy tensor for GPU to CPU context. Must be run under GPU device option. -)DOC") - .Input(0, "input", "The input tensor.") - .Output(0, "output", "Tensor that will contain a copy of the input."); - -OPERATOR_SCHEMA(CopyCPUToGPU) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - CAFFE_ENFORCE( - def.has_device_option(), - "CopyCPUToGPU op should have cuda device option."); - auto& cuda_option = def.device_option(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cpu_option); - vector out_dev(def.output_size(), cuda_option); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Copy tensor for CPU to GPU context. Must be run under GPU device option. -)DOC") - .Input(0, "input", "The input tensor.") - .Output(0, "output", "Tensor that will contain a copy of the input."); - -OPERATOR_SCHEMA(CopyFromCPUInput) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - auto op_device = - def.has_device_option() ? def.device_option() : DeviceOption(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cpu_option); - vector out_dev(def.output_size(), op_device); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Take a CPU input tensor and copy it to an output in the current -Context (GPU or CPU). This may involves cross-device MemCpy. 
-)DOC") - .Input(0, "input", "The input CPU tensor.") - .Output(0, "output", "either a TensorCUDA or a TensorCPU"); - -OPERATOR_SCHEMA(CopyOnDeviceLike) - .NumInputs(2) - .NumOutputs(1) - .SetDoc("Copy input tensor into output to the specific device.") - .Input(0, "input", "The input tensor.") - .Input(1, "dst", "Tensor, on which device the copy will be performed.") - .Output(0, "output", "Tensor that will contain a copy of the input."); OPERATOR_SCHEMA(HasElements) .NumInputs(1) @@ -937,62 +802,6 @@ struct GetFlattenToVecGradient : public GradientMakerBase { }; REGISTER_GRADIENT(FlattenToVec, GetFlattenToVecGradient); -struct GetCopyGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "CopyOnDeviceLike", - "", - vector{GO(0), I(0)}, - vector{GI(0)}); - } -}; -REGISTER_GRADIENT(Copy, GetCopyGradient); - -struct GetGPUToCPUGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (g_output_[0].IsDense()) { - return SingleGradientDef( - "CopyCPUToGPU", "", vector{GO(0)}, vector{GI(0)}); - } else { - return vector{CreateOperatorDef( - "CopyCPUToGPU", - "", - std::vector{GO_I(0)}, - std::vector{GI_I(0)}), - CreateOperatorDef( - "CopyCPUToGPU", - "", - std::vector{GO_V(0)}, - std::vector{GI_V(0)})}; - } - } -}; -REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient); - -struct GetCPUToGPUGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (g_output_[0].IsDense()) { - return SingleGradientDef( - "CopyGPUToCPU", "", vector{GO(0)}, vector{GI(0)}); - } else { - return vector{CreateOperatorDef( - "CopyGPUToCPU", - "", - std::vector{GO_I(0)}, - std::vector{GI_I(0)}), - CreateOperatorDef( - "CopyGPUToCPU", - "", - std::vector{GO_V(0)}, - std::vector{GI_V(0)})}; - } - } -}; -REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); - SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds); SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths); SHOULD_NOT_DO_GRADIENT(SegmentIdsToRanges); diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu index e771c4ee36e1cc..8272bbcad3e55c 100644 --- a/caffe2/operators/utility_ops.cu +++ b/caffe2/operators/utility_ops.cu @@ -36,28 +36,6 @@ bool SumOp::RunOnDevice() { return false; } -template <> -class CopyOnDeviceLikeOp - : public Operator { - public: - CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_FUNCTIONS(CUDAContext); - - bool RunOnDevice() override { - auto& input = Input(0); - auto* output = OperatorBase::Output(0, CUDA); - CUDAContext context(GetGPUIDForPointer(Input(1).raw_data())); - output->ResizeLike(input); - context.template CopyItems( - input.meta(), - input.size(), - input.raw_data(), - output->raw_mutable_data(input.meta())); - return true; - } -}; - REGISTER_CUDA_OPERATOR(Print, PrintOp); REGISTER_CUDA_OPERATOR(Flatten, FlattenOp); REGISTER_CUDA_OPERATOR(FlattenToVec, FlattenToVecOp); @@ -66,27 +44,6 @@ REGISTER_CUDA_OPERATOR(ResizeLike, ResizeLikeOp); REGISTER_CUDA_OPERATOR(Sum, SumOp); REGISTER_CUDA_OPERATOR(WeightedSum, WeightedSumOp); -// From CPU, copy it to whatever the current context -REGISTER_CUDA_OPERATOR( - CopyFromCPUInput, - CopyOp); - -// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context, -// since gpu code will be involved. 
-REGISTER_CUDA_OPERATOR( - CopyGPUToCPU, - CopyOp); -REGISTER_CUDA_OPERATOR( - CopyCPUToGPU, - CopyOp); -// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe -// involving different GPUs. -REGISTER_CUDA_OPERATOR(Copy, CopyOp); - -REGISTER_CUDA_OPERATOR( - CopyOnDeviceLike, - CopyOnDeviceLikeOp); - REGISTER_CUDA_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp); CAFFE_KNOWN_TYPE(const float*); diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index c9564dfa74a86a..9a615f53e25394 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -91,8 +91,8 @@ class PrintOp final : public Operator { return true; } - if (!this->template InputIsType(0, Context::GetDeviceType()) && - !this->template InputIsType(0, CPU)) { + if (!this->InputIsTensorType(0, Context::GetDeviceType()) && + !this->InputIsTensorType(0, CPU)) { LOG(INFO) << "Blob of type: " << OperatorBase::Inputs().at(0)->meta().name(); return true; @@ -113,7 +113,7 @@ class PrintOp final : public Operator { unsigned char, std::string>; - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { return DispatchHelper::call( this, this->template Input(0, CPU)); } else { @@ -129,7 +129,7 @@ class PrintOp final : public Operator { // will handle memory deallocation itself so no smart pointer is needed. const TensorCPU* tensor; Tensor tensor_copy_if_needed(CPU); - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { tensor = &this->template Input(0, CPU); } else { tensor_copy_if_needed.CopyFrom(Input(0), &context_); @@ -325,7 +325,7 @@ class WeightedSumOp : public Operator { template bool DoRunWithType() { - const int input_size = InputSize(); + const int input_size = this->InputSize(); CAFFE_ENFORCE_EQ(input_size % 2, 0); const auto& X0 = Input(0); const auto& weight0 = Input(1); @@ -698,33 +698,6 @@ class ScatterAssignOp : public Operator { INPUT_TAGS(DATA, INDICES, SLICES); }; -template -class CopyOp : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_SIMPLE_CTOR_DTOR(CopyOp); - - bool RunOnDevice() override { - auto& input = this->template Input(0, SrcContext::GetDeviceType()); - auto* output = - this->template Output(0, DstContext::GetDeviceType()); - output->ResizeLike(input); - this->context_.template CopyItems( - input.meta(), - input.size(), - input.raw_data(), - output->raw_mutable_data(input.meta())); - return true; - } -}; - -template -class CopyOnDeviceLikeOp : public CopyOp { - public: - CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : CopyOp(operator_def, ws) {} -}; - template class LengthsToSegmentIdsOp : public Operator { public: diff --git a/caffe2/operators/while_op.h b/caffe2/operators/while_op.h index 258862b690e4a6..3e90341bcdd7ef 100644 --- a/caffe2/operators/while_op.h +++ b/caffe2/operators/while_op.h @@ -35,7 +35,7 @@ class WhileOp final : public Operator { bool RunOnDevice() override { CAFFE_ENFORCE( - this->template InputIsType(0, Context::GetDeviceType()), + this->InputIsTensorType(0, Context::GetDeviceType()), "Invalid condition in While operator: tensor expected"); const auto& condition = Input(0); diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 80e2308eabf3cd..f9956060b75cfd 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -264,8 +264,7 @@ std::unique_ptr convertToNeuralNetOperator( /// \brief Ingest a caffe2 protobuf model and output an NNModule. 
/// \param net The caffe2 protobuf NetDef -/// \param blobMap [optional][output] A pointer to a blobMap to be populated with all the output blobs of the NetDef by name->NodeRef -repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut) { +repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict) { repr::NNModule module; repr::NNGraph& dfg = module.dataFlow; repr::NNCFGraph& cfg = module.controlFlow; @@ -285,7 +284,6 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map> currentBasicBlock = auto bbNode = cfg.createNode(util::make_unique>()); @@ -323,17 +321,26 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map(input)); + } + } } for (const auto& outputName : net.external_output()) { @@ -345,9 +352,6 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map(instrNode); + auto* annotation = nnOp->getAnnotation(); + if (!annotation) { + auto new_annot = util::make_unique(); + new_annot->setOperatorDef(convertToOperatorDef(instrNode)); + nnOp->setAnnotation(std::move(new_annot)); + annotation = nnOp->getAnnotation(); + } + CAFFE_ENFORCE(isa(annotation)); + auto c2_annotation = dyn_cast(annotation); + return *c2_annotation; +} + caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m) { auto predictNet = caffe2::NetDef(); return convertToCaffe2Proto(m, predictNet); diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index 31281ab90572ec..47b3b8c7a3ec3e 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -19,8 +19,12 @@ class Caffe2Annotation : public nom::repr::Annotation { : Annotation(AnnotationKind::Caffe2), Device(device) {} virtual ~Caffe2Annotation() {} - void setDevice(std::string device) { Device = device; } - const std::string getDevice() const { return Device; } + void setDevice(std::string device) { + Device = device; + } + const std::string getDevice() const { + return Device; + } void setDeviceType(int device) { DeviceType = device; @@ -33,6 +37,11 @@ class Caffe2Annotation : public nom::repr::Annotation { OpDef = opDef; OpDefExists = true; } + + bool hasOperatorDef() const { + return OpDefExists; + } + const caffe2::OperatorDef& getOperatorDef() const { CAFFE_ENFORCE( OpDefExists, @@ -57,7 +66,7 @@ class Caffe2Annotation : public nom::repr::Annotation { int DeviceType = caffe2::DeviceTypeProto::PROTO_CPU; }; -CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); +CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict = false); CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); @@ -73,6 +82,10 @@ CAFFE2_API std::unique_ptr convertToNeuralNetOpera CAFFE2_API caffe2::OperatorDef convertToOperatorDef( const nom::repr::NNGraph::NodeRef& instrNode); +// If the annotation doesn't exist, attempt to add it +CAFFE2_API Caffe2Annotation +getOrAddCaffe2Annotation(nom::repr::NNGraph::NodeRef& instrNode); + class CAFFE2_API Converter { public: explicit Converter() = default; diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index cb1c9028d5c12b..84dac93753d37a 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -10,7 +10,7 @@ void enforceIsTensor(Workspace* ws, const std::string& name) { auto blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob does not exist: ", name); CAFFE_ENFORCE( - blob->template IsType(CPU), "Blob is not a CPU Tensor: ", name); + blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name); } 
TensorCPU* getTensor(Workspace* ws, const std::string& name) { diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto new file mode 100644 index 00000000000000..9e626d8d845260 --- /dev/null +++ b/caffe2/proto/torch.proto @@ -0,0 +1,550 @@ +syntax = "proto2"; + +import "caffe2/proto/caffe2.proto"; + +package torch; + +// Overview +// +// ONNX is an open specification that is comprised of the following components: +// +// 1) A definition of an extensible computation graph model. +// 2) Definitions of standard data types. +// 3) Definitions of built-in operators. +// +// This document describes the syntax of models and their computation graphs, +// as well as the standard data types. Together, they are referred to as the ONNX +// Intermediate Representation, or 'IR' for short. +// +// The normative semantic specification of the ONNX IR is found in docs/IR.md. +// Definitions of the built-in neural network operators may be found in docs/Operators.md. + +// Notes +// +// Release +// +// We are still in the very early stage of defining ONNX. The current +// version of ONNX is a starting point. While we are actively working +// towards a complete spec, we would like to get the community involved +// by sharing our working version of ONNX. +// +// Protobuf compatibility +// +// To simplify framework compatibility, ONNX is defined using the subset of +// protobuf that is compatible with both protobuf v2 and v3. This means that we +// do not use any protobuf features that are only available in one of the two +// versions. +// +// Here are the most notable contortions we have to carry out to work around +// these limitations: +// +// - No 'map' (added protobuf 3.0). We instead represent mappings as lists +// of key-value pairs, where order does not matter and duplicates +// are not allowed. + +// Versioning +// +// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md +// +// To be compatible with both proto2 and proto3, we will use a version number +// that is not defined by the default value but an explicit enum number. +enum Version { + // proto3 requires the first enum value to be zero. + // We add this just to appease the compiler. + _START_VERSION = 0; + // The version field is always serialized and we will use it to store the + // version that the graph is generated from. This helps us set up version + // control. + // For the IR, we are using simple numbers starting with with 0x00000001, + // which was the version we published on Oct 10, 2017. + IR_VERSION_2017_10_10 = 0x0000000000000001; + + // IR_VERSION 2 published on Oct 30, 2017 + // - Added type discriminator to AttributeProto to support proto3 users + IR_VERSION_2017_10_30 = 0x0000000000000002; + + // IR VERSION 3 published on Nov 3, 2017 + // - For operator versioning: + // - Added new message OperatorSetIdProto + // - Added opset_import in ModelProto + // - For vendor extensions, added domain in NodeProto + IR_VERSION_NEWEST_ONNX = 0x0000000000000003; + + // PYTORCH IR VERSION + IR_VERSION_NEWEST = 0x0000000000000103; +} + +// Attributes +// +// A named attribute containing either singular float, integer, string, graph, +// and tensor values, or repeated float, integer, string, graph, and tensor values. +// An AttributeProto MUST contain the name field, and *only one* of the +// following content fields, effectively enforcing a C/C++ union equivalent. +message AttributeProto { + + // Note: this enum is structurally identical to the OpSchema::AttrType + // enum defined in schema.h. 
If you rev one, you likely need to rev the other. + enum AttributeType { + UNDEFINED = 0; + FLOAT = 1; + INT = 2; + STRING = 3; + TENSOR = 4; + GRAPH = 5; + + FLOATS = 6; + INTS = 7; + STRINGS = 8; + TENSORS = 9; + GRAPHS = 10; + } + + // The name field MUST be present for this version of the IR. + optional string name = 1; // namespace Attribute + + // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. + // In this case, this AttributeProto does not contain data, and it's a reference of attribute + // in parent scope. + // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. + optional string ref_attr_name = 21; + + // A human-readable documentation for this attribute. Markdown is allowed. + optional string doc_string = 13; + + // The type field MUST be present for this version of the IR. + // For 0.0.1 versions of the IR, this field was not defined, and + // implementations needed to use has_field hueristics to determine + // which value field was in use. For IR_VERSION 0.0.2 or later, this + // field MUST be set and match the f|i|s|t|... field in use. This + // change was made to accomodate proto3 implementations. + optional AttributeType type = 20; // discriminator that indicates which field below is in use + + // Exactly ONE of the following fields must be present for this version of the IR + optional float f = 2; // float + optional int64 i = 3; // int + optional bytes s = 4; // UTF-8 string + optional TensorProto t = 5; // tensor value + optional GraphProto g = 6; // graph + // Do not use field below, it's deprecated. + // optional ValueProto v = 12; // value - subsumes everything but graph + + repeated float floats = 7; // list of floats + repeated int64 ints = 8; // list of ints + repeated bytes strings = 9; // list of UTF-8 strings + repeated TensorProto tensors = 10; // list of tensors + repeated GraphProto graphs = 11; // list of graph +} + +// Defines information on value, including the name, the type, and +// the shape of the value. +message ValueInfoProto { + // This field MUST be present in this version of the IR. + optional string name = 1; // namespace Value + // This field MUST be present in this version of the IR. + optional TypeProto type = 2; + // A human-readable documentation for this value. Markdown is allowed. + optional string doc_string = 3; +} + +// Nodes +// +// Computation graphs are made up of a DAG of nodes, which represent what is +// commonly called a "layer" or "pipeline stage" in machine learning frameworks. +// +// For example, it can be a node of type "Conv" that takes in an image, a filter +// tensor and a bias tensor, and produces the convolved output. +message NodeProto { + repeated string input = 1; // namespace Value + repeated string output = 2; // namespace Value + + // An optional identifier for this node in a graph. + // This field MAY be absent in ths version of the IR. + optional string name = 3; // namespace Node + + // The symbolic identifier of the Operator to execute. + optional string op_type = 4; // namespace Operator + // The domain of the OperatorSet that specifies the operator named by op_type. + optional string domain = 7; // namespace Domain + + // Additional named attributes. + repeated AttributeProto attribute = 5; + + // A human-readable documentation for this node. Markdown is allowed. 
+ optional string doc_string = 6; + + // Additional annotations, attributes are defined in Schema + // To be added as annotations: + // string engine + // string list control_input + // int64 is_gradient_op + // string debug_info + repeated AttributeProto annotations = 8; + + // Besides the node type, PyTorhc also serialize ATen function signature + optional caffe2.DeviceOption device_option = 51; + optional string aten_function = 52; +} + +// Models +// +// ModelProto is a top-level file/container format for bundling a ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto. +// +// Model ==> Caffe2 MetaNetDef +// ==> PyTorch Module +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. + optional int64 ir_version = 1; + + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. + // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum below. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; + + // The parameterized graph that is evaluated to execute the model. + // The main graph, in single graph case, it is ONNX compatible. + optional GraphProto graph = 7; + + // The remaining nets in MetaNetDef. + // Submodules and methods in PyTorch. + repeated GraphProto methods = 15; + + // Named metadata values; keys should be distinct. + // Many meta data in MetaNetDef and preditor are piggy backed here. + // 1) project + // 2) model_class + // 3) internal_version + // 4) predictor_type + // 5) predictor_id + // 6) execute_plan + // 7) applicationSpecificInfo (another string map, need to verify it has no duplicate.) + // 8) engine + // 9) publish time + repeated StringStringEntryProto metadata_props = 14; + + // Model name + optional string name = 16; + + // Model name + repeated AttributeProto annotations = 17; + + // Mapping from list name to blob name list, must be string list type. + // Equivalent to blobs in MetaNetDef. + repeated AttributeProto blob_lists = 51; + + // Mapping from plan name to serialized plan, must be string list type. + // Equivalent to plans in MetaNetDef. + repeated AttributeProto plans = 52; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. 
+// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value= 2; +}; + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. +// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +// Graph ==> NetDef in Caffe2 +// ==> Submodule/Method in PyTorch +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each TensorProto entry must have a distinct name (within the list) that + // also appears in the input list. + repeated TensorProto initializer = 5; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. + repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. + repeated ValueInfoProto value_info = 13; + + // Additional annotations. + repeated AttributeProto annotations = 14; + + // DO NOT USE the following fields, they were deprecated from earlier versions. + // repeated string input = 3; + // repeated string output = 4; + // optional int64 ir_version = 6; + // optional int64 producer_version = 7; + // optional string producer_tag = 8; + // optional string domain = 9; +} + +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. + FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool + + // Advanced types + FLOAT16 = 10; + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + // Future extensions go here. + + // Special data type, real type information is stored in ValueInfoProto. + // If data_type is SPECIAL, raw_data should be used. + SPECIAL = 51; + } + + // The shape of the tensor. + repeated int64 dims = 1; + repeated int64 strides = 14; + + // The data type of the tensor. + optional DataType data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + optional int64 chuck_num = 51; + optional int64 chuck_id = 52; + } + // Used as offset in the external shared data. + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. + // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. + + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. 
(e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, bool, and float16 values + // float16 values must be bit-wise converted to an uint16_t prior + // to writing to the buffer. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, UINT16, INT8, BOOL, or FLOAT16 + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. + optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store tensor value, elements MUST + // be stored in as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. + // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // + // Note: the advantage of specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // For double + // Complex64 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; + + // External data by file name + optional string external_data = 13; + + // If two tensors represent the same weights/content, use alias. + // Must exist a TensorProto named alias in the initializer list. + // To avoid the duplicate tensor in attribute, such as value in Constant node. + // This is useful, if everything is stored just in the proto. + optional string alias = 16; + + // Additional annotations. + repeated AttributeProto annotations = 17; + + // Device info + optional caffe2.DeviceOption device_option = 51; + + // For PyTorch serialized tensor. + optional int64 require_gradient = 52; + optional int64 is_buffer = 53; +} + +// Defines a tensor shape. 
A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. +message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + // To represent a scalar, using no dim to represent 0-d tensor. + repeated Dimension dim = 1; + + repeated Dimension stride = 51; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST be present for this version of the IR. + optional TensorProto.DataType elem_type = 1; + optional TensorShapeProto shape = 2; + } + + // Sequence type: List, Tuple + message Sequence { + // elem_type and elem_type_list cannot appear together. + // If all the element types are the same, we use elem_type, + // otherwise, we specify the type of each element in elem_type_list. + optional TypeProto elem_type = 1; + repeated TypeProto elem_type_list = 51; + enum SequenceType { + UNDEFINED = 0; + LIST = 1; + TUPLE = 2; + } + optional SequenceType sequence_type = 52; + } + + // Map, (not necessary at this moment) + message Map { + optional TensorProto.DataType key_type = 1; + optional TypeProto value_type = 2; + } + + // Special type of blobs, based on the type_name, we can choose the right + // serializer and deserialzier. + message SpecialBlob { + optional string type_name = 1; + } + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + Sequence sequence_type = 4; + Map map_type = 5; + SpecialBlob special_type = 51; + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} + +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. 
+ optional int64 version = 2; +} diff --git a/caffe2/python/compatibility.py b/caffe2/python/compatibility.py new file mode 100644 index 00000000000000..9d615a30833371 --- /dev/null +++ b/caffe2/python/compatibility.py @@ -0,0 +1,8 @@ +from six import PY2, PY3 + +if PY2: + import collections + container_abcs = collections +elif PY3: + import collections.abc + container_abcs = collections.abc diff --git a/caffe2/python/examples/resnet50_trainer.py b/caffe2/python/examples/resnet50_trainer.py index e1716857bb16b0..05b753b8fd397e 100644 --- a/caffe2/python/examples/resnet50_trainer.py +++ b/caffe2/python/examples/resnet50_trainer.py @@ -22,8 +22,10 @@ from caffe2.python.predictor_constants import predictor_constants as predictor_constants ''' -Parallelized multi-GPU distributed trainer for Resnet 50. Can be used to train -on imagenet data, for example. +Parallelized multi-GPU distributed trainer for Resne(X)t. +Can be used to train on imagenet data, for example. +The default parameters can train a standard Resnet-50 (1x64d), and parameters +can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d). To run the trainer in single-machine multi-gpu mode by setting num_shards = 1. @@ -39,14 +41,23 @@ ''' logging.basicConfig() -log = logging.getLogger("resnet50_trainer") +log = logging.getLogger("ResNe(X)t_trainer") log.setLevel(logging.DEBUG) dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops') dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops') -def AddImageInput(model, reader, batch_size, img_size, dtype, is_test): +def AddImageInput( + model, + reader, + batch_size, + img_size, + dtype, + is_test, + mean_per_channel=None, + std_per_channel=None, +): ''' The image input operator loads image and label data from the reader and applies transformations to the images (random cropping, mirroring, ...). @@ -58,6 +69,9 @@ def AddImageInput(model, reader, batch_size, img_size, dtype, is_test): output_type=dtype, use_gpu_transform=True if model._device_type == 1 else False, use_caffe_datum=True, + mean_per_channel=mean_per_channel, + std_per_channel=std_per_channel, + # mean_per_channel takes precedence over mean mean=128., std=128., scale=256, @@ -166,6 +180,7 @@ def RunEpoch( # TODO: add loading from checkpoint log.info("Starting epoch {}/{}".format(epoch, args.num_epochs)) epoch_iters = int(args.epoch_size / total_batch_size / num_shards) + test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards) for i in range(epoch_iters): # This timeout is required (temporarily) since CUDA-NCCL # operators might deadlock when synchronizing between GPUs. 
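The new caffe2/python/compatibility.py module above exists so the same isinstance checks work on both Python 2 (collections) and Python 3 (collections.abc); the ONNX backend and frontend switch to it further down in this diff. A minimal sketch of the intended call pattern (normalize_to_list is a made-up name for illustration):

from caffe2.python.compatibility import container_abcs

def normalize_to_list(ops):
    # Wrap a single op so callers can always iterate, without relying on
    # collections.Iterable, which newer Python 3 releases deprecate in
    # favor of collections.abc.Iterable.
    if not isinstance(ops, container_abcs.Iterable):
        ops = [ops]
    return list(ops)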
@@ -194,19 +209,25 @@ def RunEpoch( data_parallel_model.GetLearningRateBlobNames(train_model)[0] ) test_accuracy = 0 - if (test_model is not None): + test_accuracy_top5 = 0 + if test_model is not None: # Run 100 iters of testing ntests = 0 - for _ in range(0, 100): + for _ in range(test_epoch_iters): workspace.RunNet(test_model.net.Proto().name) for g in test_model._devices: test_accuracy += np.asscalar(workspace.FetchBlob( "{}_{}".format(test_model._device_prefix, g) + '/accuracy' )) + test_accuracy_top5 += np.asscalar(workspace.FetchBlob( + "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5' + )) ntests += 1 test_accuracy /= ntests + test_accuracy_top5 /= ntests else: test_accuracy = (-1) + test_accuracy_top5 = (-1) explog.log( input_count=num_images, @@ -216,7 +237,8 @@ def RunEpoch( 'loss': loss, 'learning_rate': learning_rate, 'epoch': epoch, - 'test_accuracy': test_accuracy, + 'top1_test_accuracy': test_accuracy, + 'top5_test_accuracy': test_accuracy_top5, } ) assert loss < 40, "Exploded gradients :(" @@ -243,6 +265,17 @@ def Train(args): total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" + # Verify valid image mean/std per channel + if args.image_mean_per_channel: + assert \ + len(args.image_mean_per_channel) == args.num_channels, \ + "The number of channels of image mean doesn't match input" + + if args.image_std_per_channel: + assert \ + len(args.image_std_per_channel) == args.num_channels, \ + "The number of channels of image std doesn't match input" + # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) @@ -262,7 +295,7 @@ def Train(args): 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper( - name="resnet50", arg_scope=train_arg_scope + name='resnext' + str(args.num_layers), arg_scope=train_arg_scope ) num_shards = args.num_shards @@ -324,7 +357,7 @@ def Train(args): rendezvous = None # Model building functions - def create_resnet50_model_ops(model, loss_scale): + def create_resnext_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) @@ -333,11 +366,14 @@ def create_resnet50_model_ops(model, loss_scale): BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): - pred = resnet.create_resnet50( + pred = resnet.create_resnext( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, + num_layers=args.num_layers, + num_groups=args.resnext_num_groups, + num_width_per_group=args.resnext_width_per_group, no_bias=True, no_loss=True, ) @@ -348,7 +384,8 @@ def create_resnet50_model_ops(model, loss_scale): softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) - brew.accuracy(model, [softmax, "label"], "accuracy") + brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) + brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) return [loss] def add_optimizer(model): @@ -408,6 +445,8 @@ def add_image_input(model): img_size=args.image_size, dtype=args.dtype, is_test=False, + mean_per_channel=args.image_mean_per_channel, + std_per_channel=args.image_std_per_channel, ) def add_post_sync_ops(model): @@ -423,7 +462,7 @@ def add_post_sync_ops(model): data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, - 
forward_pass_builder_fun=create_resnet50_model_ops, + forward_pass_builder_fun=create_resnext_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, @@ -449,7 +488,9 @@ def add_post_sync_ops(model): 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper( - name="resnet50_test", arg_scope=test_arg_scope, init_params=False + name='resnext' + str(args.num_layers) + "_test", + arg_scope=test_arg_scope, + init_params=False, ) test_reader = test_model.CreateDB( @@ -466,12 +507,14 @@ def test_input_fn(model): img_size=args.image_size, dtype=args.dtype, is_test=True, + mean_per_channel=args.image_mean_per_channel, + std_per_channel=args.image_std_per_channel, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, - forward_pass_builder_fun=create_resnet50_model_ops, + forward_pass_builder_fun=create_resnext_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, @@ -497,7 +540,8 @@ def test_input_fn(model): else: log.warning("The format of load_model_path doesn't match!") - expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % ( + expname = "resnext_%d_gpu%d_b%d_L%d_lr%.2f_v2" % ( + args.num_layers, args.num_gpus, total_batch_size, args.num_labels, @@ -534,12 +578,24 @@ def test_input_fn(model): def main(): # TODO: use argv parser = argparse.ArgumentParser( - description="Caffe2: Resnet-50 training" + description="Caffe2: ResNe(X)t training" ) parser.add_argument("--train_data", type=str, default=None, required=True, help="Path to training data (or 'null' to simulate)") + parser.add_argument("--num_layers", type=int, default=50, + help="The number of layers in ResNe(X)t model") + parser.add_argument("--resnext_num_groups", type=int, default=1, + help="The cardinality of resnext") + parser.add_argument("--resnext_width_per_group", type=int, default=64, + help="The cardinality of resnext") parser.add_argument("--test_data", type=str, default=None, help="Path to test data") + parser.add_argument("--image_mean_per_channel", type=float, nargs='+', + help="The per channel mean for the images") + parser.add_argument("--image_std_per_channel", type=float, nargs='+', + help="The per channel standard deviation for the images") + parser.add_argument("--test_epoch_size", type=int, default=50000, + help="Number of test images") parser.add_argument("--db_type", type=str, default="lmdb", help="Database type (such as lmdb or leveldb)") parser.add_argument("--gpus", type=str, @@ -576,7 +632,7 @@ def main(): help="Port of Redis server (for rendezvous)") parser.add_argument("--file_store_path", type=str, default="/tmp", help="Path to directory to use for rendezvous") - parser.add_argument("--save_model_name", type=str, default="resnet50_model", + parser.add_argument("--save_model_name", type=str, default="resnext_model", help="Save the trained model to a given name") parser.add_argument("--load_model_path", type=str, default=None, help="Load previously saved model to continue training") @@ -598,6 +654,7 @@ def main(): Train(args) + if __name__ == '__main__': workspace.GlobalInit(['caffe2', '--caffe2_log_level=2']) main() diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 64174f6e71c676..5d41fdfb18262b 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -2229,7 +2229,7 @@ def ref_nhwc(x, scale, bias): in_place=st.booleans(), **hu.gcs) def test_unsafe_coalesce(self, sizes, in_place, gc, dc): - gAlignment = 32 + gAlignment = 
64 Xs = [np.random.randn(size) .astype(np.random.choice([np.float32, np.float64, np.uint8])) for size in sizes] diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index d96739b66c865b..4c3661b284dc3c 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -28,6 +28,12 @@ def get_sparse_lookup_predictor_version(version): return version +def get_sparse_lookup_trainer_version(version): + assert version in {'fp32', 'fp16'},\ + "Unexpected version of sparse_lookup layer {0}".format(version) + return version + + def _is_id_list(input_record): return schema.equal_schemas(input_record, IdList) @@ -72,10 +78,12 @@ def __init__(self, model, input_record, inner_shape, reducer, "{} should have categorical limit > 0, but got {}".format( get_key(input_record)(), input_dim)) - scale = math.sqrt(1.0 / input_dim) + self.input_dim = input_dim self.shape = [input_dim] + inner_shape - self.weight_init = weight_init if weight_init else ( - 'UniformFill', {'min': -scale, 'max': scale}) + + default_init_op = self._get_default_init_op() + + self.weight_init = weight_init or default_init_op if _is_id_list(self.input_record): sparse_key = self.input_record.items() @@ -141,6 +149,25 @@ def get_8bits_compatible_parameters(self, fused=True): ) return [RowwiseQuantized8BitsWeight(self.w, self.scale_bias)] + def _get_default_init_op(self): + scale = math.sqrt(1.0 / self.input_dim) + + cur_scope = get_current_scope() + trainer_version = get_sparse_lookup_trainer_version( + **cur_scope.get(get_sparse_lookup_trainer_version.__name__, + {'version': 'fp32'})) + + if trainer_version == 'fp32': + default_weight_init = ('UniformFill', {'min': -scale, 'max': scale}) + elif trainer_version == 'fp16': + default_weight_init = ("Float16UniformFill", {'min': -scale, 'max': scale}) + else: + raise NotImplementedError( + "Train version {} is not currently supported".format(trainer_version) + ) + + return default_weight_init + def _gather_wrapper(self, net, version, in_indices, out): # Gather can work on all kinds of input data types, and output # data with the same type. Convert the output of Gather to float, diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index ae8b6cd889ef38..146d5eb53cebe9 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -86,6 +86,131 @@ def mkl_tmp(name): fix_BoxWithNMSLimit(net) +def rewrite_run_net_simple_xrayocr_lstm(net, ideep=True): + # For xrayocr model with lstm, only rewrite the non-lstm part of the net to + # enable mkl, then copy the temporary output blob at the break point + # and all external inputs for lstm part to cpu, and execuate rest of the net + # (two lstm) on cpu + # This only works for the xrayocr lstm model which uses the first 'Shape' op + # to decide the break point, and after two lstm it's external_output + # directly so there's no need to copy back to ideep/mkl + + def mkl_tmp(name): + return "{}__MKL__".format(name) + + def cpu_tmp(name): + return "{}__CPU__".format(name) + + input_blob = net.external_input[0] + if input_blob != net.op[0].input[0]: + raise Exception( + "Input blob: {} is not consumed by first op: {}".format( + input_blob, net.op[0])) + # Modify input/outputs to point to copied MKL blobs. 
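# Aside on the sparse_lookup change earlier in this diff: the new
# _get_default_init_op picks the embedding initializer from the trainer
# version found in the current scope. A standalone sketch of that selection
# rule, with a plain argument standing in for the scope lookup (the function
# name here is illustrative, not the layer's API):
import math

def default_sparse_lookup_init(input_dim, trainer_version='fp32'):
    # Uniform in [-sqrt(1/input_dim), sqrt(1/input_dim)]; the fp16 trainer
    # version swaps in Float16UniformFill as the fill op.
    scale = math.sqrt(1.0 / input_dim)
    if trainer_version == 'fp32':
        return ('UniformFill', {'min': -scale, 'max': scale})
    elif trainer_version == 'fp16':
        return ('Float16UniformFill', {'min': -scale, 'max': scale})
    raise NotImplementedError(
        "Train version {} is not currently supported".format(trainer_version))

# e.g. a 10000-row embedding table gets bounds of +/-0.01
print(default_sparse_lookup_init(10000, 'fp16'))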
+ from_cpu = "CopyCPUToIDEEP" if ideep else "CopyCPUToMKL" + to_cpu = "CopyIDEEPToCPU" if ideep else "CopyMKLToCPU" + copy_input_op = core.CreateOperator( + from_cpu, input_blob, mkl_tmp(input_blob)) + net.op[0].input[0] = mkl_tmp(input_blob) + + # the net may contain some external_inputs falsely added during ONNX->Caffe2 + # This should be taken care of in early steps during pytorch_to_caffe2, + # but if not it can cause issue in follow up steps, so check here to confirm + for input_blob in net.external_input: + for op in net.op: + # look for if the external_input blob is output of any op in the net + assert input_blob not in op.output + + external_output = None + external_inputs_to_cpu = set() + find_first_shape_op = False + cpu_op_start_idx = -1 + for op_idx, op in enumerate(net.op): + # the first Shape op mark the starting point of LSTM chunk of the net + if not find_first_shape_op: + if op.type == 'Shape': + external_output = op.input + find_first_shape_op = True + cpu_op_start_idx = op_idx + else: + # any external input in the LSTM part need to be copied to CPU + for in_blob in op.input: + if in_blob in net.external_input: + external_inputs_to_cpu.add(in_blob) + + # make sure we found the expected break point of the net + assert external_output is not None + + # create op to copy external input blobs used in LSTM part from IDEEP to CPU + copy_extra_input_ops = [] + for in_blob in external_inputs_to_cpu: + copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob, + cpu_tmp(in_blob))) + # rename input blobs in LSTM part to use the CPU copy + for op in net.op[cpu_op_start_idx:]: + renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob) + for blob in op.input] + op.input[:] = renamed_input + + copy_output_ops = [ + core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob) + for output_blob in external_output] + + for output_blob in external_output: + last_producer_idx = last_producer(net.op, output_blob) + renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob) + for blob in net.op[last_producer_idx].output] + net.op[last_producer_idx].output[:] = renamed_outputs + + # rearrange all ops in correct order + ops = [copy_input_op] + net.op[:cpu_op_start_idx] \ + + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:] + del net.op[:] + net.op.extend(ops) + + device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN + for op in net.op: + # the first Shape op mark the starting point of LSTM chunk of the net + if op.type == 'Shape': + # all LSTM ops should run on CPU + device = caffe2_pb2.CPU + op.device_option.MergeFrom( + core.DeviceOption(device_type=device)) + op.engine = "" + + # RecurrentNetwork has a nested step_net that needs special treatment + if op.type == 'RecurrentNetwork': + for arg in op.arg: + if arg.name == 'step_net': + for nested_op in arg.n.op: + # set device to CPU + nested_op.device_option.MergeFrom( + core.DeviceOption(device_type=device)) + nested_op.engine = "" + + # rename inputs in op of nested net + renamed_input = [] + for blob in nested_op.input: + renamed_input.append(blob + if blob not in external_inputs_to_cpu + else cpu_tmp(blob)) + nested_op.input[:] = renamed_input + + # rename external inputs of nested net + new_external_input = [] + for blob in arg.n.external_input: + new_external_input.append(blob + if blob not in external_inputs_to_cpu + else cpu_tmp(blob)) + arg.n.external_input[:] = new_external_input + + if ideep: + # Temporarily disbale conv+relu fusion until we verify further + # net.ParseFromString( + # 
C.transform_optimizeForIDEEP(net.SerializeToString())) + fix_BoxWithNMSLimit(net) + + def rewrite_model_helper_simple(model, ideep=True): model = copy.deepcopy(model) # All parameter initialization should run on MKL diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py index 60e00ed1a1ae24..7c6c6dc27fe10a 100644 --- a/caffe2/python/models/resnet.py +++ b/caffe2/python/models/resnet.py @@ -6,9 +6,12 @@ from __future__ import print_function from caffe2.python import brew +import logging + ''' -Utility for creating ResNets -See "Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 +Utility for creating ResNe(X)t +"Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 +"Aggregated Residual Transformations for Deep Neural Networks" by Xie et. al. 2016 ''' @@ -17,16 +20,33 @@ class ResNetBuilder(): Helper class for constructing residual blocks. ''' - def __init__(self, model, prev_blob, no_bias, is_test, spatial_bn_mom=0.9): + def __init__( + self, + model, + prev_blob, + no_bias, + is_test, + bn_epsilon=1e-5, + bn_momentum=0.9, + ): self.model = model self.comp_count = 0 self.comp_idx = 0 self.prev_blob = prev_blob self.is_test = is_test - self.spatial_bn_mom = spatial_bn_mom + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum self.no_bias = 1 if no_bias else 0 - def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0): + def add_conv( + self, + in_filters, + out_filters, + kernel, + stride=1, + group=1, + pad=0, + ): self.comp_idx += 1 self.prev_blob = brew.conv( self.model, @@ -37,6 +57,7 @@ def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0): weight_init=("MSRAFill", {}), kernel=kernel, stride=stride, + group=group, pad=pad, no_bias=self.no_bias, ) @@ -56,8 +77,8 @@ def add_spatial_bn(self, num_filters): self.prev_blob, 'comp_%d_spatbn_%d' % (self.comp_count, self.comp_idx), num_filters, - epsilon=1e-3, - momentum=self.spatial_bn_mom, + epsilon=self.bn_epsilon, + momentum=self.bn_momentum, is_test=self.is_test, ) return self.prev_blob @@ -71,7 +92,8 @@ def add_bottleneck( input_filters, # num of feature maps from preceding layer base_filters, # num of filters internally in the component output_filters, # num of feature maps to output - down_sampling=False, + stride=1, + group=1, spatial_batch_norm=True, ): self.comp_idx = 0 @@ -82,7 +104,7 @@ def add_bottleneck( input_filters, base_filters, kernel=1, - stride=1 + stride=1, ) if spatial_batch_norm: @@ -95,8 +117,9 @@ def add_bottleneck( base_filters, base_filters, kernel=3, - stride=(1 if down_sampling is False else 2), - pad=1 + stride=stride, + group=group, + pad=1, ) if spatial_batch_norm: @@ -109,9 +132,10 @@ def add_bottleneck( last_conv = self.add_spatial_bn(output_filters) # Summation with input signal (shortcut) - # If we need to increase dimensions (feature maps), need to - # do a projection for the short cut - if (output_filters > input_filters): + # When the number of feature maps mismatch between the input + # and output (this usually happens when the residual stage + # changes), we need to do a projection for the short cut + if output_filters != input_filters: shortcut_blob = brew.conv( self.model, shortcut_blob, @@ -120,7 +144,7 @@ def add_bottleneck( output_filters, weight_init=("MSRAFill", {}), kernel=1, - stride=(1 if down_sampling is False else 2), + stride=stride, no_bias=self.no_bias, ) if spatial_batch_norm: @@ -129,8 +153,8 @@ def add_bottleneck( shortcut_blob, 'shortcut_projection_%d_spatbn' % self.comp_count, 
output_filters, - epsilon=1e-3, - momentum=self.spatial_bn_mom, + epsilon=self.bn_epsilon, + momentum=self.bn_momentum, is_test=self.is_test, ) @@ -144,6 +168,8 @@ def add_bottleneck( # Keep track of number of high level components if this ResNetBuilder self.comp_count += 1 + return output_filters + def add_simple_block( self, input_filters, @@ -205,28 +231,102 @@ def add_simple_block( self.comp_count += 1 +def create_resnet_32x32( + model, data, num_input_channels, num_groups, num_labels, is_test=False +): + ''' + Create residual net for smaller images (sec 4.2 of He et. al (2015)) + num_groups = 'n' in the paper + ''' + # conv1 + maxpool + brew.conv( + model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 + ) + brew.spatial_bn( + model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test + ) + brew.relu(model, 'conv1_spatbn', 'relu1') + + # Number of blocks as described in sec 4.2 + filters = [16, 32, 64] + + builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) + prev_filters = 16 + for groupidx in range(0, 3): + for blockidx in range(0, 2 * num_groups): + builder.add_simple_block( + prev_filters if blockidx == 0 else filters[groupidx], + filters[groupidx], + down_sampling=(True if blockidx == 0 and + groupidx > 0 else False)) + prev_filters = filters[groupidx] + + # Final layers + brew.average_pool( + model, builder.prev_blob, 'final_avg', kernel=8, stride=1 + ) + brew.fc(model, 'final_avg', 'last_out', 64, num_labels) + softmax = brew.softmax(model, 'last_out', 'softmax') + return softmax + + +RESNEXT_BLOCK_CONFIG = { + 18: (2, 2, 2, 2), + 34: (3, 4, 6, 3), + 50: (3, 4, 6, 3), + 101: (3, 4, 23, 3), + 152: (3, 8, 36, 3), + 200: (3, 24, 36, 3), +} + +RESNEXT_STRIDES = [1, 2, 2, 2] + +logging.basicConfig() +log = logging.getLogger("resnext_builder") +log.setLevel(logging.DEBUG) + + # The conv1 and final_avg kernel/stride args provide a basic mechanism for # adapting resnet50 for different sizes of input images. -def create_resnet50( +def create_resnext( model, data, num_input_channels, num_labels, + num_layers, + num_groups, + num_width_per_group, label=None, is_test=False, no_loss=False, - no_bias=0, + no_bias=1, conv1_kernel=7, conv1_stride=2, final_avg_kernel=7, + log=None, + bn_epsilon=1e-5, + bn_momentum=0.9, ): + if num_layers not in RESNEXT_BLOCK_CONFIG: + log.error("{}-layer is invalid for resnext config".format(num_layers)) + + num_blocks = RESNEXT_BLOCK_CONFIG[num_layers] + strides = RESNEXT_STRIDES + num_filters = [64, 256, 512, 1024, 2048] + + if num_layers in [18, 34]: + num_filters = [64, 64, 128, 256, 512] + + # the number of features before the last FC layer + num_features = num_filters[-1] + # conv1 + maxpool - brew.conv( + conv_blob = brew.conv( model, data, 'conv1', num_input_channels, - 64, + num_filters[0], weight_init=("MSRAFill", {}), kernel=conv1_kernel, stride=conv1_stride, @@ -234,41 +334,40 @@ def create_resnet50( no_bias=no_bias ) - brew.spatial_bn( + bn_blob = brew.spatial_bn( model, - 'conv1', + conv_blob, 'conv1_spatbn_relu', - 64, - epsilon=1e-3, - momentum=0.1, + num_filters[0], + epsilon=bn_epsilon, + momentum=bn_momentum, is_test=is_test ) - brew.relu(model, 'conv1_spatbn_relu', 'conv1_spatbn_relu') - brew.max_pool(model, 'conv1_spatbn_relu', 'pool1', kernel=3, stride=2) + relu_blob = brew.relu(model, bn_blob, bn_blob) + max_pool = brew.max_pool(model, relu_blob, 'pool1', kernel=3, stride=2, pad=1) # Residual blocks... 
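# Before the residual-stage loop in the next hunk, it helps to see how the
# per-stage dimensions fall out of RESNEXT_BLOCK_CONFIG, num_filters and the
# doubling bottleneck width. A sketch assuming the 32x4d configuration
# (num_groups=32, num_width_per_group=4); the trainer's defaults of 1 and 64
# reproduce a plain ResNet-50:
block_counts = (3, 4, 6, 3)                  # RESNEXT_BLOCK_CONFIG[50]
strides = [1, 2, 2, 2]                       # RESNEXT_STRIDES
num_filters = [64, 256, 512, 1024, 2048]

num_groups, width_per_group = 32, 4
inner_dim = num_groups * width_per_group     # bottleneck width, 128 for 32x4d
dim_in = num_filters[0]
for stage, blocks in enumerate(block_counts):
    dim_out = num_filters[stage + 1]
    print("stage %d: %d blocks, %d -> %d (width %d, stride %d)"
          % (stage, blocks, dim_in, dim_out, inner_dim, strides[stage]))
    dim_in = dim_out
    inner_dim *= 2
# stage 0: 3 blocks, 64 -> 256 (width 128, stride 1)
# stage 1: 4 blocks, 256 -> 512 (width 256, stride 2)
# stage 2: 6 blocks, 512 -> 1024 (width 512, stride 2)
# stage 3: 3 blocks, 1024 -> 2048 (width 1024, stride 2)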
- builder = ResNetBuilder(model, 'pool1', no_bias=no_bias, - is_test=is_test, spatial_bn_mom=0.1) - - # conv2_x (ref Table 1 in He et al. (2015)) - builder.add_bottleneck(64, 64, 256) - builder.add_bottleneck(256, 64, 256) - builder.add_bottleneck(256, 64, 256) - - # conv3_x - builder.add_bottleneck(256, 128, 512, down_sampling=True) - for _ in range(1, 4): - builder.add_bottleneck(512, 128, 512) - - # conv4_x - builder.add_bottleneck(512, 256, 1024, down_sampling=True) - for _ in range(1, 6): - builder.add_bottleneck(1024, 256, 1024) + builder = ResNetBuilder(model, max_pool, no_bias=no_bias, + is_test=is_test, bn_epsilon=1e-5, bn_momentum=0.9) + + inner_dim = num_groups * num_width_per_group + + # 4 different kinds of residual blocks + for residual_idx in range(4): + residual_num = num_blocks[residual_idx] + residual_stride = strides[residual_idx] + dim_in = num_filters[residual_idx] + + for blk_idx in range(residual_num): + dim_in = builder.add_bottleneck( + dim_in, + inner_dim, + num_filters[residual_idx + 1], # dim out + stride=residual_stride if blk_idx == 0 else 1, + group=num_groups, + ) - # conv5_x - builder.add_bottleneck(1024, 512, 2048, down_sampling=True) - builder.add_bottleneck(2048, 512, 2048) - builder.add_bottleneck(2048, 512, 2048) + inner_dim *= 2 # Final layers final_avg = brew.average_pool( @@ -282,7 +381,7 @@ def create_resnet50( # Final dimension of the "image" is reduced to 7x7 last_out = brew.fc( - model, final_avg, 'last_out_L{}'.format(num_labels), 2048, num_labels + model, final_avg, 'last_out_L{}'.format(num_labels), num_features, num_labels ) if no_loss: @@ -301,40 +400,35 @@ def create_resnet50( return brew.softmax(model, last_out, "softmax") -def create_resnet_32x32( - model, data, num_input_channels, num_groups, num_labels, is_test=False +# The conv1 and final_avg kernel/stride args provide a basic mechanism for +# adapting resnet50 for different sizes of input images. +def create_resnet50( + model, + data, + num_input_channels, + num_labels, + label=None, + is_test=False, + no_loss=False, + no_bias=0, + conv1_kernel=7, + conv1_stride=2, + final_avg_kernel=7, ): - ''' - Create residual net for smaller images (sec 4.2 of He et. 
al (2015)) - num_groups = 'n' in the paper - ''' - # conv1 + maxpool - brew.conv( - model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 - ) - brew.spatial_bn( - model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test - ) - brew.relu(model, 'conv1_spatbn', 'relu1') - - # Number of blocks as described in sec 4.2 - filters = [16, 32, 64] - - builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) - prev_filters = 16 - for groupidx in range(0, 3): - for blockidx in range(0, 2 * num_groups): - builder.add_simple_block( - prev_filters if blockidx == 0 else filters[groupidx], - filters[groupidx], - down_sampling=(True if blockidx == 0 and - groupidx > 0 else False)) - prev_filters = filters[groupidx] - - # Final layers - brew.average_pool( - model, builder.prev_blob, 'final_avg', kernel=8, stride=1 + # resnet50 is a special case for ResNeXt50-1x64d + return create_resnext( + model, + data, + num_input_channels, + num_labels, + num_layers=50, + num_groups=1, + num_width_per_group=64, + label=label, + is_test=is_test, + no_loss=no_loss, + no_bias=no_bias, + conv1_kernel=conv1_kernel, + conv1_stride=conv1_stride, + final_avg_kernel=final_avg_kernel, ) - brew.fc(model, 'final_avg', 'last_out', 64, num_labels) - softmax = brew.softmax(model, 'last_out', 'softmax') - return softmax diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index 708eae6b2a071c..abe1971680a7e9 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -68,7 +68,10 @@ def render(s): NeuralNetOperator = C.NeuralNetOperator +Operator = C.NeuralNetOperator NeuralNetData = C.NeuralNetData +Data = C.NeuralNetData NNSubgraph = C.NNSubgraph NNMatchGraph = C.NNMatchGraph Graph = C.Graph +Annotation = C.Annotation diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 7739ac05f2979f..9288364bbcb944 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -174,3 +174,52 @@ def test_convertToProto(self): assert a == b for a, b in zip(new_netdef.external_output, net.Proto().external_output): assert a == b + + def test_node_interactions(self): + nn = ng.NNModule() + dfg = nn.dataFlow + test1 = dfg.createNode(ng.Operator("test1")) + test2 = dfg.createNode(ng.Operator("test2")) + x = dfg.createNode(ng.Data("x")) + dfg.createEdge(test1, x) + dfg.createEdge(x, test2) + p = test2.getOperatorPredecessors() + assert len(p) == 1 + assert p[0] == test1 + + # Add another node + test3 = dfg.createNode(ng.Operator("test3")) + y = dfg.createNode(ng.Data("y")) + dfg.createEdge(test3, y) + dfg.createEdge(y, test2) + p = test2.getOperatorPredecessors() + assert len(p) == 2 + assert test1 in p + assert test3 in p + + # Successors + assert len(test2.getOperatorSuccessors()) == 0 + assert len(test1.getOperatorSuccessors()) == 1 + assert test1.getOperatorSuccessors()[0] == test2 + + # Check all the nodes are valid (pybind ownership test) + for node in [test1, test2, test3]: + assert node.isOperator() + for node in [x, y]: + assert node.isTensor() + + def test_annotation_basic(self): + annot = ng.Annotation() + annot.setDevice("woot") + assert annot.getDevice() == "woot" + annot.setDeviceType(7) + assert annot.getDeviceType() == 7 + + def test_annotation_from_graph(self): + nn = ng.NNModule() + node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) + annot = node.getAnnotation() + annot.setDeviceType(7) + node.setAnnotation(annot) + new_annot = node.getAnnotation() + assert new_annot.getDeviceType() == 7 diff --git 
a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 79dadb091488e6..7eacaf327ad264 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -25,6 +25,7 @@ import caffe2 from caffe2.python import core, workspace, rnn_cell, gru_cell +from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper from caffe2.proto import caffe2_pb2 import caffe2.python.utils @@ -140,7 +141,7 @@ class Caffe2Backend(Backend): # If you increase this, make SURE you cross-reference all BC-breaking # changes from one version to the next, and any that you did not # implement, mark as broken in _broken_operators - _known_opset_version = 7 + _known_opset_version = 9 # This dictionary will record operators which are KNOWN to be # broken, so we give a good error message rather than do something @@ -778,7 +779,7 @@ def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) if isinstance(ops, Caffe2Ops): return ops - if not isinstance(ops, collections.Iterable): + if not isinstance(ops, container_abcs.Iterable): ops = [ops] return Caffe2Ops(ops, [], []) diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index 5fd470c932ac59..379ef65af904a6 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -12,11 +12,11 @@ from __future__ import unicode_literals import itertools -import collections import logging import re from caffe2.python import core as caffe2_core +from caffe2.python.compatibility import container_abcs from caffe2.proto import caffe2_legacy_pb2 from enum import Enum from onnx import (defs, checker, helper, numpy_helper, mapping, @@ -156,7 +156,7 @@ def caffe2_op_to_onnx_node(cls, op_def, shapes): const_tensors = [] if isinstance(nodes, tuple): nodes, const_tensors = nodes - if not isinstance(nodes, collections.Iterable): + if not isinstance(nodes, container_abcs.Iterable): nodes = [nodes] return nodes, const_tensors diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 0d35110740825b..5be8b689f115cb 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -11,12 +11,13 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.mkl_test_util as mu +import caffe2.python.serialized_test.serialized_test_util as serial import unittest -class TestActivations(hu.HypothesisTestCase): - @given(X=hu.tensor(), in_place=st.booleans(), +class TestActivations(serial.SerializedTestCase): + @serial.given(X=hu.tensor(), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **mu.gcs) def test_relu(self, X, in_place, engine, gc, dc): if gc == mu.mkl_do: @@ -74,7 +75,7 @@ def relu_grad_ref(g_out, outputs, fwd_inputs): output_to_grad="X" if in_place else "Y", grad_reference=relu_grad_ref) - @given(X=hu.tensor(elements=st.floats(-3.0, 3.0)), + @serial.given(X=hu.tensor(elements=st.floats(-3.0, 3.0)), n=st.floats(min_value=0.5, max_value=2.0), in_place=st.booleans(), **hu.gcs) def test_relu_n(self, X, n, in_place, gc, dc): @@ -100,7 +101,7 @@ def relu_n_ref(X): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=0.005) - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), in_place=st.booleans(), 
engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) @@ -169,7 +170,7 @@ def prelu_ref(X, W): # Gradient check wrt W self.assertGradientChecks(gc, op, [X, W], 1, [0], stepsize=1e-2) - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), inplace=st.booleans(), **hu.gcs) diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 9112d50f38df86..2976b06108ff51 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -12,9 +12,10 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestAdadelta(hu.HypothesisTestCase): +class TestAdadelta(serial.SerializedTestCase): @staticmethod def ref_adadelta(param_in, mom_in, @@ -44,7 +45,7 @@ def ref_adadelta(param_in, return (param_out.astype(np.float32), mom_out.astype(np.float32), mom_delta_out.astype(np.float32)) - @given(inputs=hu.tensors(n=4), + @serial.given(inputs=hu.tensors(n=4), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -136,7 +137,7 @@ def ref_sparse(param, moment, moment_delta, indices, grad, lr, decay, ref_using_fp16 ], ref_sparse) - @given(inputs=hu.tensors(n=3), + @serial.given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index e4101e92cf01cb..69aead865d1c3f 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -15,11 +15,13 @@ from caffe2.python.operator_test.adagrad_test_helper import ( ref_adagrad, adagrad_sparse_test_helper ) +import caffe2.python.serialized_test.serialized_test_util as serial import unittest import os -class TestAdagrad(hu.HypothesisTestCase): + +class TestAdagrad(serial.SerializedTestCase): @staticmethod def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon): mom_out = mom_in + np.mean(np.square(grad)) @@ -27,7 +29,7 @@ def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon): param_out = param_in + grad_adj return (param_out, mom_out) - @given(inputs=hu.tensors(n=3), + @serial.given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -112,7 +114,7 @@ def test_sparse_adagrad(self, inputs, lr, epsilon, gc, dc): return adagrad_sparse_test_helper(self, inputs, lr, epsilon, None, ref_adagrad, gc, dc) - @given(inputs=hu.tensors(n=2), + @serial.given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -223,7 +225,7 @@ def ref_row_wise_sparse(param, momentum, indices, grad, lr): [param, momentum, indices, grad, lr], ref_row_wise_sparse) - @given(inputs=hu.tensors(n=1), + @serial.given(inputs=hu.tensors(n=1), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 9cf3f7c06b4385..8209b1c0493095 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -34,15 
+34,18 @@ def ref_adam(param, mom1, mom2, grad, LR, ITER, @staticmethod def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, - beta1, beta2, epsilon): + beta1, beta2, epsilon, output_grad=False): t = ITER + 1 - corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \ + corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \ (1 - np.power(beta1, t)) mom1_out = (beta1 * mom1) + (1 - beta1) * grad mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad)) - param_out = param + corrected_local_rate * mom1_out / \ - (np.sqrt(mom2_out) + epsilon) - return (param_out, mom1_out, mom2_out) + grad_out = corrected_local_rate * mom1_out / (np.sqrt(mom2_out) + epsilon) + param_out = param + LR * grad_out + if output_grad: + return param_out, mom1_out, mom2_out, grad_out + else: + return param_out, mom1_out, mom2_out @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), @@ -176,6 +179,76 @@ def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_sparse, input_device_options=input_device_options) + @given(inputs=hu.tensors(n=4), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs) + def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, + data_strategy, gc, dc): + param, mom1, mom2, grad = inputs + mom2 = np.absolute(mom2) + ITER = np.array([ITER], dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Verify that the generated indices are unique + hypothesis.assume( + np.array_equal( + np.unique(indices.flatten()), + np.sort(indices.flatten()))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "SparseAdam", + ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "output_grad"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, + beta1, beta2, epsilon, output_grad): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + grad_out = np.copy(grad) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ + self.ref_adam(param[index], mom1[index], mom2[index], + grad[i], LR, ITER, + beta1, beta2, epsilon, output_grad) + return (param_out, mom1_out, mom2_out, grad_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, indices, grad, LR, ITER], + functools.partial( + ref_sparse_output_grad, + beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), + input_device_options=input_device_options) + @given(inputs=hu.tensors(n=3), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, @@ -252,6 +325,87 @@ def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_row_wise_sparse, input_device_options=input_device_options) + 
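# Aside on the ref_row_wise_adam change above: the learning rate is now
# applied after the bias-corrected update is formed, so the effective
# gradient can be returned when output_grad=True. A NumPy sketch of that
# reference math (function name and sample arguments are illustrative):
import numpy as np

def row_wise_adam_ref(param, mom1, mom2, grad, lr, it, beta1, beta2, eps):
    t = it + 1
    corrected = np.sqrt(1 - np.power(beta2, t)) / (1 - np.power(beta1, t))
    mom1_out = beta1 * mom1 + (1 - beta1) * grad
    # row-wise variant: mom2 is a single running average of the squared
    # gradient for the whole row, not a per-element moment
    mom2_out = beta2 * mom2 + (1 - beta2) * np.mean(np.square(grad))
    grad_out = corrected * mom1_out / (np.sqrt(mom2_out) + eps)
    return param + lr * grad_out, mom1_out, mom2_out, grad_out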
@given(inputs=hu.tensors(n=3), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs_cpu_only) + def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, + epsilon, data_strategy, gc, dc): + param, mom1, grad = inputs + ITER = np.array([ITER], dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create a 1D row-wise average 2nd moment tensor. + mom2 = data_strategy.draw( + hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0], + elements=hu.elements_of_type(dtype=np.float32)) + ) + mom2 = np.absolute(mom2) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment + # tensor that is strictly 1-dimensional and equal in length to the + # first dimension of the parameters, so indices must also be + # 1-dimensional. + indices = indices.flatten() + + hypothesis.note('indices.shape: %s' % str(indices.shape)) + + # Verify that the generated indices are unique + hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "RowWiseSparseAdam", + ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "output_grad"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, + beta1, beta2, epsilon, output_grad): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + grad_out = np.copy(grad) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ + self.ref_row_wise_adam(param[index], mom1[index], mom2[index], + grad[i], LR, ITER, + beta1, beta2, epsilon, output_grad) + return (param_out, mom1_out, mom2_out, grad_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, indices, grad, LR, ITER], + functools.partial( + ref_row_wise_sparse_output_grad, + beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), + input_device_options=input_device_options) + if __name__ == "__main__": import unittest diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 6e56da29b7f6a9..bcce4efc8ec529 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -2,16 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestAffineChannelOp(hu.HypothesisTestCase): +class TestAffineChannelOp(serial.SerializedTestCase): def 
affine_channel_nchw_ref(self, X, scale, bias): dims = X.shape N = dims[0] @@ -30,9 +29,10 @@ def affine_channel_nhwc_ref(self, X, scale, bias): Y = X * scale + bias return [Y.reshape(dims)] - @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), - W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - is_learnable=st.booleans(), in_place=st.booleans(), **hu.gcs) + @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), is_learnable=st.booleans(), + in_place=st.booleans(), **hu.gcs) def test_affine_channel_2d( self, N, C, H, W, order, is_learnable, in_place, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py index 6492189808a72c..9bdea7ecf5cdbf 100644 --- a/caffe2/python/operator_test/arg_ops_test.py +++ b/caffe2/python/operator_test/arg_ops_test.py @@ -9,11 +9,13 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestArgOps(hu.HypothesisTestCase): - @given(X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) +class TestArgOps(serial.SerializedTestCase): + @serial.given( + X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), + keepdims=st.booleans(), **hu.gcs) def test_argmax(self, X, axis, keepdims, gc, dc): if axis >= len(X.shape): axis %= len(X.shape) @@ -32,8 +34,9 @@ def argmax_ref(X): self.assertReferenceChecks(gc, op, [X], argmax_ref) self.assertDeviceChecks(dc, op, [X], [0]) - @given(X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) + @serial.given( + X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), + keepdims=st.booleans(), **hu.gcs) def test_argmin(self, X, axis, keepdims, gc, dc): if axis >= len(X.shape): axis %= len(X.shape) diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py index f8bc77ba9e5a7a..7252499352ee84 100644 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ b/caffe2/python/operator_test/batch_box_cox_test.py @@ -7,6 +7,7 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -51,8 +52,8 @@ def _inputs(draw): ) -class TestBatchBoxCox(hu.HypothesisTestCase): - @given( +class TestBatchBoxCox(serial.SerializedTestCase): + @serial.given( inputs=_inputs(), **hu.gcs_cpu_only ) diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index 711240de9cfdfc..301941afb590c3 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -6,13 +6,14 @@ import numpy as np from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -class TestBatchBucketize(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) +class TestBatchBucketize(serial.SerializedTestCase): + @serial.given(**hu.gcs_cpu_only) def test_batch_bucketize_example(self, gc, dc): op = core.CreateOperator('BatchBucketize', ["FEATURE", "INDICES", 
"BOUNDARIES", "LENGTHS"], diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py index 2db25e73892563..91d49b76ee4119 100644 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -2,16 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestBatchMomentsOp(hu.HypothesisTestCase): +class TestBatchMomentsOp(serial.SerializedTestCase): def batch_moments_nchw_ref(self, X): dims = X.shape N = dims[0] @@ -29,9 +28,9 @@ def batch_moments_nhwc_ref(self, X): var = np.mean(np.square(X), axis=0) return [mu, var] - @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), - W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) + @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_batch_moments_2d(self, N, C, H, W, order, gc, dc): op = core.CreateOperator( "BatchMoments", diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py index a8e04fb2e14750..a47cc44e593c8b 100644 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np - from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu +import numpy as np -class TestBatchSparseToDense(hu.HypothesisTestCase): +class TestBatchSparseToDense(serial.SerializedTestCase): - @given( + @serial.given( batch_size=st.integers(5, 10), dense_last_dim=st.integers(5, 10), default_value=st.floats(min_value=2.0, max_value=3.0), diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py index b54a4435513be7..f76891bae4dc89 100644 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ b/caffe2/python/operator_test/bbox_transform_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -201,8 +203,8 @@ def generate_rois_rotated(roi_counts, im_dims): return rotated_rois -class TestBBoxTransformOp(hu.HypothesisTestCase): - @given( +class TestBBoxTransformOp(serial.SerializedTestCase): + @serial.given( num_rois=st.integers(1, 10), num_classes=st.integers(1, 10), im_dim=st.integers(100, 600), diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 638248d60bafe5..8811f5667503b8 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py 
@@ -2,18 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -from hypothesis import assume, given -import hypothesis.strategies as st - from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import assume, given +import hypothesis.strategies as st +import numpy as np -class TestBooleanMaskOp(hu.HypothesisTestCase): +class TestBooleanMaskOp(serial.SerializedTestCase): - @given(x=hu.tensor(min_dim=1, + @serial.given(x=hu.tensor(min_dim=1, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0)), **hu.gcs) @@ -54,7 +54,7 @@ def _dtype_conversion(x, dtype, gc, dc): x = x.astype(dtype) return x, dc - @given(x=hu.tensor(min_dim=2, + @serial.given(x=hu.tensor(min_dim=2, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0)), dtype=st.sampled_from([np.float32, np.float16]), diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py index 86b2fedb49a245..e3bc9f248d3a26 100644 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ b/caffe2/python/operator_test/boolean_unmask_test.py @@ -3,16 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -from hypothesis import given -import hypothesis.strategies as st - from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np -class TestUnmaskOp(hu.HypothesisTestCase): - @given(N=st.integers(min_value=2, max_value=20), +class TestUnmaskOp(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=2, max_value=20), dtype=st.sampled_from([ np.bool_, np.int8, diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index 8cd9acbd6a5d3c..52155c0a5d7649 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -3,13 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -import unittest +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st - -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core +import unittest +import numpy as np def get_op(input_len, output_len, args): @@ -64,8 +64,8 @@ def gen_multiple_boxes(centers, scores, count, num_classes): return ret_box, ret_scores -class TestBoxWithNMSLimitOp(hu.HypothesisTestCase): - @given(**HU_CONFIG) +class TestBoxWithNMSLimitOp(serial.SerializedTestCase): + @serial.given(**HU_CONFIG) def test_simple(self, gc): in_centers = [(0, 0), (20, 20), (50, 50)] in_scores = [0.9, 0.8, 0.6] diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 79293fc6453232..130364261ea166 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import 
hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np import unittest -class TestCeil(hu.HypothesisTestCase): +class TestCeil(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_ceil(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py index 5c59b8d6f05c1b..e516288b436c38 100644 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ b/caffe2/python/operator_test/channel_backprop_stats_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 import unittest -class TestChannelBackpropStats(hu.HypothesisTestCase): - @given( +class TestChannelBackpropStats(serial.SerializedTestCase): + @serial.given( size=st.integers(7, 10), inputChannels=st.integers(1, 10), batchSize=st.integers(1, 3), diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index e17d7a736063c4..34417fd1847337 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -3,14 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -import caffe2.python.hypothesis_test_util as hu from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class ChannelShuffleOpsTest(hu.HypothesisTestCase): +class ChannelShuffleOpsTest(serial.SerializedTestCase): def _channel_shuffle_nchw_ref(self, X, group): dims = X.shape N = dims[0] @@ -31,7 +32,7 @@ def _channel_shuffle_nhwc_ref(self, X, group): Y = np.transpose(X, axes=(0, 1, 3, 2)) return [Y.reshape(dims)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), + @serial.given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), H=st.integers(1, 5), W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_channel_shuffle(self, N, G, K, H, W, order, gc, dc): diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py index 2a238e67542f5a..f1daddee7721dd 100644 --- a/caffe2/python/operator_test/channel_stats_op_test.py +++ b/caffe2/python/operator_test/channel_stats_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 import unittest -class TestChannelStats(hu.HypothesisTestCase): - @given( +class TestChannelStats(serial.SerializedTestCase): + @serial.given( 
size=st.integers(7, 10), inputChannels=st.integers(1, 10), batchSize=st.integers(1, 3), diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py index 38499a69eb1d90..46163d30dedc2f 100644 --- a/caffe2/python/operator_test/clip_op_test.py +++ b/caffe2/python/operator_test/clip_op_test.py @@ -10,10 +10,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestClip(hu.HypothesisTestCase): - @given(X=hu.tensor(), +class TestClip(serial.SerializedTestCase): + @serial.given(X=hu.tensor(), min_=st.floats(min_value=-2, max_value=0), max_=st.floats(min_value=0, max_value=2), inplace=st.booleans(), diff --git a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py index bea2133d5182ee..042b4ef2a8326c 100644 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ b/caffe2/python/operator_test/clip_tensor_op_test.py @@ -4,15 +4,15 @@ from __future__ import unicode_literals from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestClipTensorByScalingOp(hu.HypothesisTestCase): +class TestClipTensorByScalingOp(serial.SerializedTestCase): - @given(n=st.integers(5, 8), d=st.integers(2, 4), + @serial.given(n=st.integers(5, 8), d=st.integers(2, 4), threshold=st.floats(0.1, 10), additional_threshold=st.floats(0.1, 10), use_additional_threshold=st.booleans(), diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index facb675a4944ec..e37738801745c2 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals import numpy as np -import unittest import os +import unittest from hypothesis import given, settings import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, utils from caffe2.proto import caffe2_pb2 +from caffe2.python import core, utils +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial # # Should match original Detectron code at @@ -129,17 +130,17 @@ def collect_and_distribute_fpn_rpn_ref(*inputs): return outputs -class TestCollectAndDistributeFpnRpnProposals(hu.HypothesisTestCase): +class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(proposal_count=st.integers(min_value=1000, max_value=8000), - rpn_min_level=st.integers(min_value=1, max_value=4), - rpn_num_levels=st.integers(min_value=1, max_value=6), - roi_min_level=st.integers(min_value=1, max_value=4), - roi_num_levels=st.integers(min_value=1, max_value=6), - rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), - roi_canonical_scale=st.integers(min_value=100, max_value=300), - roi_canonical_level=st.integers(min_value=1, max_value=8), - **hu.gcs_cpu_only) + @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), + rpn_min_level=st.integers(min_value=1, max_value=4), + rpn_num_levels=st.integers(min_value=1, 
max_value=6), + roi_min_level=st.integers(min_value=1, max_value=4), + roi_num_levels=st.integers(min_value=1, max_value=6), + rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), + roi_canonical_scale=st.integers(min_value=100, max_value=300), + roi_canonical_level=st.integers(min_value=1, max_value=8), + **hu.gcs_cpu_only) def test_collect_and_dist( self, proposal_count, diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index af8d5486e6e6ab..3d2c4ae31946d9 100644 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -3,13 +3,14 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.proto import caffe2_pb2 from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest @st.composite @@ -44,8 +45,8 @@ def _tensor_splits(draw, add_axis=False): ) -class TestConcatSplitOps(hu.HypothesisTestCase): - @given(tensor_splits=_tensor_splits(), +class TestConcatSplitOps(serial.SerializedTestCase): + @serial.given(tensor_splits=_tensor_splits(), **hu.gcs) def test_concat(self, tensor_splits, gc, dc): axis, _, splits = tensor_splits @@ -92,7 +93,7 @@ def test_concat_add_axis(self, tensor_splits, gc, dc): for i in range(len(splits)): self.assertGradientChecks(gc, op, splits, i, [0]) - @given(tensor_splits=_tensor_splits(), + @serial.given(tensor_splits=_tensor_splits(), split_as_arg=st.booleans(), **hu.gcs) def test_split(self, tensor_splits, split_as_arg, gc, dc): @@ -127,7 +128,7 @@ def split_ref(input, split=split_info): self.assertDeviceChecks(dc, op, input_tensors, outputs_with_grad) self.assertGradientChecks(gc, op, input_tensors, 0, outputs_with_grad) - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py index b96b530d687751..88d8fd8b7a27a3 100644 --- a/caffe2/python/operator_test/conditional_test.py +++ b/caffe2/python/operator_test/conditional_test.py @@ -2,15 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np -from hypothesis import given -import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np -class TestConditionalOp(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) +class TestConditionalOp(serial.SerializedTestCase): + @serial.given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) def test_conditional(self, rows_num, gc, dc): op = core.CreateOperator( "Conditional", ["condition", "data_t", "data_f"], "output" diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py new file mode 100644 index 00000000000000..54a57dfd51928d --- /dev/null +++ b/caffe2/python/operator_test/conftest.py @@ -0,0 +1,39 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import 
caffe2.python.serialized_test.serialized_test_util as serial + + +def pytest_addoption(parser): + parser.addoption( + '-G', + '--generate-serialized', + action='store_true', + dest='generate', + help='generate output files (default=false, compares to current files)', + ) + parser.addoption( + '-O', + '--output', + default=serial.DATA_DIR, + dest='output', + help='output directory (default: %(default)s)' + ) + parser.addoption( + '-D', + '--disable-serialized-check', + action='store_true', + dest='disable', + help='disable checking serialized tests' + ) + + +def pytest_configure(config): + generate = config.getoption('generate', default=False) + output = config.getoption('output', default=serial.DATA_DIR) + disable = config.getoption('disable', default=False) + serial._output_context.__setattr__('should_generate_output', generate) + serial._output_context.__setattr__('output_dir', output) + serial._output_context.__setattr__('disable_serialized_check', disable) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 8e65a9324535a6..d29d724b89c29d 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -13,11 +13,13 @@ from caffe2.python import brew, core, workspace import caffe2.python.hypothesis_test_util as hu from caffe2.python.model_helper import ModelHelper +import caffe2.python.serialized_test.serialized_test_util as serial import caffe2.python._import_c_extension as C import unittest import os + def _cudnn_supports( dilation=False, nhwc=False, @@ -54,7 +56,7 @@ def _cudnn_convolution_algo_count(direction): return st.sampled_from([-1]) -class TestConvolution(hu.HypothesisTestCase): +class TestConvolution(serial.SerializedTestCase): # CUDNN does NOT support different padding values and we skip it @given(op_type=st.sampled_from(["Conv", "Conv2D"]), stride_h=st.integers(1, 3), @@ -636,14 +638,15 @@ def test_use_cudnn_engine_interactions(self): self.assertEqual(model.Proto().op[-1].engine, expected_engine) - @given(op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), - G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), - H=st.integers(1, 4), W=st.integers(1, 4), use_bias=st.booleans(), - order=st.sampled_from(["NCHW", "NHWC"]), - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs) + @serial.given( + op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), + G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), + H=st.integers(1, 4), W=st.integers(1, 4), use_bias=st.booleans(), + order=st.sampled_from(["NCHW", "NHWC"]), + force_algo_fwd=_cudnn_convolution_algo_count("fwd"), + force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), + force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), + **hu.gcs) def test_1x1_conv(self, op_type, N, G, DX, DY, H, W, use_bias, order, force_algo_fwd, force_algo_dgrad, force_algo_wgrad, gc, dc): diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index d67df5fd3e1f32..1124df94e67ae7 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -9,10 +9,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial 
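
The new conftest.py above registers three pytest options that feed serial._output_context: -G/--generate-serialized to write expected outputs, -O/--output to pick the output directory (defaulting to serial.DATA_DIR), and -D/--disable-serialized-check to skip the comparison. A hedged usage sketch; the flag names come from the hunk, while the test path and output directory are illustrative:

# Shell equivalents, shown as comments:
#   pytest caffe2/python/operator_test -G                  # regenerate expected outputs
#   pytest caffe2/python/operator_test -O /tmp/serialized  # write outputs to a custom directory
#   pytest caffe2/python/operator_test -D                  # run without the serialized comparison
import pytest

# Programmatic form of the first invocation.
pytest.main(["caffe2/python/operator_test", "--generate-serialized"])
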
-class TestCosineEmbeddingCriterion(hu.HypothesisTestCase): - @given(N=st.integers(min_value=10, max_value=20), +class TestCosineEmbeddingCriterion(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=10, max_value=20), seed=st.integers(min_value=0, max_value=65535), margin=st.floats(min_value=-0.5, max_value=0.5), **hu.gcs) diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 51d2bbc6f484ab..4deef35c5bb506 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -2,10 +2,12 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from collections import defaultdict, Counter from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -15,9 +17,9 @@ DEFAULT_PRUNE_THRESHOLD = 0.001 -class TestCTCBeamSearchDecoderOp(hu.HypothesisTestCase): +class TestCTCBeamSearchDecoderOp(serial.SerializedTestCase): - @given( + @serial.given( batch=st.sampled_from([1, 2, 4]), max_time=st.sampled_from([1, 8, 64]), alphabet_size=st.sampled_from([1, 2, 32, 128, 512]), diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py index 0e638e8155e9e7..98079d2b026ae0 100644 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py @@ -2,18 +2,19 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np - import unittest -class TestCTCGreedyDecoderOp(hu.HypothesisTestCase): +class TestCTCGreedyDecoderOp(serial.SerializedTestCase): - @given( + @serial.given( batch=st.sampled_from([2, 4, 128, 256]), max_time=st.sampled_from([2, 10, 30, 50]), num_classes=st.sampled_from([2, 10, 26, 40]), diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index 0a5f9a38a4a7c6..753b94d20f1f54 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -3,15 +3,16 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class DistanceTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 3), +class DistanceTest(serial.SerializedTestCase): + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_cosine_similarity(self, n, dim, gc, dc): @@ -32,7 +33,7 @@ def test_cosine_similarity(self, n, dim, gc, dc): self.assertGradientChecks(gc, cos_op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) - @given(inputs=hu.tensors(n=2, + @serial.given(inputs=hu.tensors(n=2, min_dim=1, max_dim=2, dtype=np.float32), @@ -57,7 +58,7 @@ def dot_ref(X, Y): # Gradient check wrt Y self.assertGradientChecks(gc, 
op, [X, Y], 1, [0]) - @given(n=st.integers(1, 3), + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L1_distance(self, n, dim, gc, dc): @@ -88,7 +89,7 @@ def test_L1_distance(self, n, dim, gc, dc): self.assertGradientChecks(gc, op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) - @given(n=st.integers(1, 3), + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L2_distance(self, n, dim, gc, dc): diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index 89d63b7e13286b..2bbd9ba4efe114 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py @@ -10,11 +10,12 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestDropout(hu.HypothesisTestCase): +class TestDropout(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), in_place=st.booleans(), ratio=st.floats(0, 0.999), engine=st.sampled_from(["", "CUDNN"]), diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index c67a84921a4ad0..8c7df5f33625b2 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestElementwiseLinearOp(hu.HypothesisTestCase): +class TestElementwiseLinearOp(serial.SerializedTestCase): - @given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) + @serial.given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) # @given(n=st.integers(2, 50), d=st.integers(2, 50), **hu.gcs_cpu_only) def test(self, n, d, gc, dc): X = np.random.rand(n, d).astype(np.float32) diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py index 7279dd5e1fb99c..8f665a06cd9e9a 100644 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ b/caffe2/python/operator_test/elementwise_logical_ops_test.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu - import numpy as np import unittest @@ -21,7 +21,7 @@ def rowmux(select_vec, left, right): return mux(select, left, right) -class TestWhere(hu.HypothesisTestCase): +class TestWhere(serial.SerializedTestCase): def test_reference(self): self.assertTrue(( @@ -35,7 +35,7 @@ def test_reference(self): [[3], [4]])[0] ).all()) - @given(N=st.integers(min_value=1, max_value=10), + @serial.given(N=st.integers(min_value=1, max_value=10), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_cpu_only) def test_where(self, N, gc, dc, engine): @@ -107,9 +107,9 @@ def test_rowwhere_dim2(self, N, gc, dc, engine): self.assertReferenceChecks(gc, op, [C, X, Y], rowmux) -class TestIsMemberOf(hu.HypothesisTestCase): +class TestIsMemberOf(serial.SerializedTestCase): - @given(N=st.integers(min_value=1, max_value=10), + 
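
The distance tests above cover row-wise CosineSimilarity, DotProduct, L1, and L2 distances; their reference implementations are not visible in these hunks, so the numpy sketch below only records the usual row-wise definitions for orientation (any scaling the L2 operator may apply is not confirmed here):

import numpy as np

X = np.random.rand(3, 16).astype(np.float32)
Y = np.random.rand(3, 16).astype(np.float32)

cosine = (X * Y).sum(axis=1) / (np.linalg.norm(X, axis=1) * np.linalg.norm(Y, axis=1))
dot = (X * Y).sum(axis=1)
l1 = np.abs(X - Y).sum(axis=1)
l2_sq = ((X - Y) ** 2).sum(axis=1)  # possibly scaled by 1/2 in the operator; not shown in this hunk
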
@serial.given(N=st.integers(min_value=1, max_value=10), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_cpu_only) def test_is_member_of(self, N, gc, dc, engine): diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index e767a0db161a9f..161f5fc0724b14 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -11,10 +11,11 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial # TODO(jiayq): make them hypothesis tests for better coverage. -class TestElementwiseBroadcast(hu.HypothesisTestCase): +class TestElementwiseBroadcast(serial.SerializedTestCase): @given(**hu.gcs) def test_broadcast_Add(self, gc, dc): # Set broadcast and no axis, i.e. broadcasting last dimensions. @@ -168,7 +169,7 @@ def test_broadcast_Sub(self, gc, dc): self.assertDeviceChecks(dc, op, [X, Y], [0]) self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - @given(**hu.gcs) + @serial.given(**hu.gcs) def test_broadcast_powt(self, gc, dc): np.random.seed(101) diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index efd056c8f1654d..4b608a4418dddc 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -7,11 +7,12 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestExpandOp(hu.HypothesisTestCase): +class TestExpandOp(serial.SerializedTestCase): def _rand_shape(self, X_shape, max_length): length = np.random.randint(max_length) shape = np.ones(length, dtype=np.int64) @@ -39,7 +40,7 @@ def ref(X, shape): self.assertDeviceChecks(dc, op, [X, shape], [0]) self.assertGradientChecks(gc, op, [X, shape], 0, [0]) - @given(X=hu.tensor(max_dim=5, dtype=np.float32), + @serial.given(X=hu.tensor(max_dim=5, dtype=np.float32), **hu.gcs) def test_expand_rand_shape(self, X, gc, dc): shape = self._rand_shape(X.shape, 5) diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index 1c444da6a8b80e..d10e19e4932ade 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -7,11 +7,12 @@ from caffe2.python import core from hypothesis import assume, given, settings, HealthCheck import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestFcOperator(hu.HypothesisTestCase): +class TestFcOperator(serial.SerializedTestCase): def _run_test(self, n, m, k, transposed, multi_dim, dtype, engine, gc, dc): if dtype == np.float16: # fp16 only supported with CUDA @@ -76,7 +77,7 @@ def fc_tranposed_op(X, W, b): threshold=threshold, stepsize=stepsize) @settings(max_examples=50, suppress_health_check=[HealthCheck.filter_too_much]) - @given(n=st.integers(1, 5), + @serial.given(n=st.integers(1, 5), m=st.integers(0, 5), k=st.integers(1, 5), multi_dim=st.sampled_from([True, False]), diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py index df13cba4a3b961..df7cd1dc4e7960 100644 --- 
a/caffe2/python/operator_test/filler_ops_test.py +++ b/caffe2/python/operator_test/filler_ops_test.py @@ -3,13 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import hypothesis.strategies as st - -from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 -from hypothesis import given +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given +import hypothesis.strategies as st import numpy as np @@ -19,7 +19,7 @@ def _fill_diagonal(shape, value): return (result,) -class TestFillerOperator(hu.HypothesisTestCase): +class TestFillerOperator(serial.SerializedTestCase): @given(**hu.gcs) def test_shape_error(self, gc, dc): @@ -127,7 +127,7 @@ def test_uniform_fill_using_arg(self, gc, dc): self.assertNotEqual(min_data, max_data) - @given( + @serial.given( shape=st.sampled_from( [ [3, 3], @@ -168,9 +168,9 @@ def test_diagonal_fill_op_int(self, gc, dc): # Check against numpy reference self.assertReferenceChecks(gc, op, [shape, value], _fill_diagonal) - @given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), + @serial.given(lengths=st.lists(st.integers(min_value=0, max_value=10), + min_size=0, + max_size=10), **hu.gcs) def test_lengths_range_fill(self, lengths, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py index febf6efeef76ec..153724a5f49d29 100644 --- a/caffe2/python/operator_test/find_op_test.py +++ b/caffe2/python/operator_test/find_op_test.py @@ -4,20 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core -import hypothesis.strategies as st -from hypothesis import given - - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st import numpy as np -class TestFindOperator(hu.HypothesisTestCase): +class TestFindOperator(serial.SerializedTestCase): - @given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), - idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), - **hu.gcs) + @serial.given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), + idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), + **hu.gcs) def test_find(self, n, idxsize, gc, dc): maxval = 10 diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py index 08f079b5f8cfdd..fcd20278c6caba 100644 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ b/caffe2/python/operator_test/flexible_top_k_test.py @@ -3,15 +3,16 @@ from __future__ import print_function from __future__ import unicode_literals -from collections import OrderedDict -import numpy as np - from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from collections import OrderedDict +from hypothesis import given +import numpy as np -class TestFlexibleTopK(hu.HypothesisTestCase): + +class TestFlexibleTopK(serial.SerializedTestCase): def flexible_top_k_ref(self, X, k): X_flat = X.reshape((-1, X.shape[-1])) indices_ref = np.ndarray(shape=sum(k), dtype=np.int32) @@ -38,7 +39,7 @@ def flexible_top_k_ref(self, X, k): return (values_ref, indices_ref) - @given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) + @serial.given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) def test_flexible_top_k(self, 
X, gc, dc): X = X.astype(dtype=np.float32) k_shape = (int(X.size / X.shape[-1]), ) diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py index aac1e81efa6570..4cbd269620673d 100644 --- a/caffe2/python/operator_test/floor_op_test.py +++ b/caffe2/python/operator_test/floor_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class TestFloor(hu.HypothesisTestCase): +class TestFloor(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_floor(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index 2c2bc33910c560..d5ab8e58cec0f2 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -7,12 +7,13 @@ from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import hypothesis.extra.numpy as hnp -class TestGatherOps(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), +class TestGatherOps(serial.SerializedTestCase): + @serial.given(rows_num=st.integers(1, 10000), index_num=st.integers(0, 5000), **hu.gcs) def test_gather_ops(self, rows_num, index_num, gc, dc): @@ -52,8 +53,8 @@ def _inputs(draw): ) -class TestBatchGatherOps(hu.HypothesisTestCase): - @given(inputs=_inputs(), +class TestBatchGatherOps(serial.SerializedTestCase): + @serial.given(inputs=_inputs(), **hu.gcs) def test_batch_gather_ops(self, inputs, gc, dc): data, ind = inputs diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py index d653dd3297bcd2..a16b92ba7d6992 100644 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ b/caffe2/python/operator_test/gather_ranges_op_test.py @@ -6,8 +6,8 @@ from caffe2.python import core, workspace from hypothesis import given from hypothesis import strategies as st - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np @@ -121,8 +121,9 @@ def gather_ranges_to_dense_with_key(data, ranges, key, lengths): return outputs -class TestGatherRanges(hu.HypothesisTestCase): - @given(boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) +class TestGatherRanges(serial.SerializedTestCase): + @serial.given( + boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) def test_gather_ranges(self, boarders_and_data, gc, dc): boarders, data = boarders_and_data @@ -142,7 +143,7 @@ def boarders_to_range(boarders): reference=gather_ranges, ) - @given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) + @serial.given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) def test_gather_ranges_split(self, tensor_splits, gc, dc): data, ranges, lengths, _ = tensor_splits diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index 98ebc9bed4012d..56902bb444efde 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ 
b/caffe2/python/operator_test/glu_op_test.py @@ -4,20 +4,21 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given, settings, HealthCheck import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np import unittest -class TestGlu(hu.HypothesisTestCase): +class TestGlu(serial.SerializedTestCase): # Suppress filter_too_much health check. # Reproduce by commenting @settings and uncommenting @seed. # @seed(302934307671667531413257853548643485645) @settings(suppress_health_check=[HealthCheck.filter_too_much]) - @given( + @serial.given( X=hu.tensor(), axis=st.integers(min_value=0, max_value=3), **hu.gcs diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index caa9121e924a0c..febf05136e4ded 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -2,16 +2,16 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestGroupNormOp(hu.HypothesisTestCase): +class TestGroupNormOp(serial.SerializedTestCase): def group_norm_nchw_ref(self, X, gamma, beta, group, epsilon): dims = X.shape N = dims[0] @@ -40,10 +40,11 @@ def group_norm_nhwc_ref(self, X, gamma, beta, group, epsilon): Y = gamma * (X - mu) / std + beta return [Y.reshape(dims), mu.reshape(N, G), (1.0 / std).reshape(N, G)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), - H=st.integers(2, 5), W=st.integers(2, 5), - epsilon=st.floats(min_value=1e-5, max_value=1e-4), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) + @serial.given( + N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), + H=st.integers(2, 5), W=st.integers(2, 5), + epsilon=st.floats(min_value=1e-5, max_value=1e-4), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_group_norm_2d( self, N, G, D, H, W, epsilon, order, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 1292d843d0a827..ed8945b7927e90 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -7,6 +7,7 @@ from caffe2.python.model_helper import ModelHelper from caffe2.python.rnn.rnn_cell_test_util import sigmoid, tanh, _prepare_rnn import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from caffe2.proto import caffe2_pb2 from functools import partial @@ -246,11 +247,11 @@ def generate_input_state(n, d): return hidden_t, model.net -class GRUCellTest(hu.HypothesisTestCase): +class GRUCellTest(serial.SerializedTestCase): # Test just for GRUUnitOp @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given( + @serial.given( seed=st.integers(0, 2**32 - 1), input_tensor=gru_unit_op_input(), fwd_only=st.booleans(), diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index dbb308f680021b..252855b39e4f25 100644 --- 
a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -6,11 +6,12 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestHyperbolicOps(hu.HypothesisTestCase): +class TestHyperbolicOps(serial.SerializedTestCase): def _test_hyperbolic_op(self, op_name, np_ref, X, in_place, engine, gc, dc): op = core.CreateOperator( op_name, @@ -30,15 +31,15 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0]) - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) + @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) def test_sinh(self, X, gc, dc): self._test_hyperbolic_op("Sinh", np.sinh, X, False, "", gc, dc) - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) + @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) def test_cosh(self, X, gc, dc): self._test_hyperbolic_op("Cosh", np.cosh, X, False, "", gc, dc) - @given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_tanh(self, X, in_place, engine, gc, dc): self._test_hyperbolic_op("Tanh", np.tanh, X, in_place, engine, gc, dc) diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py index 1f8b7344b74b9a..6a3678abdbe3fb 100644 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ b/caffe2/python/operator_test/index_hash_ops_test.py @@ -2,15 +2,17 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestIndexHashOps(hu.HypothesisTestCase): - @given( +class TestIndexHashOps(serial.SerializedTestCase): + @serial.given( indices=st.sampled_from([ np.int32, np.int64 ]).flatmap(lambda dtype: hu.tensor(min_dim=1, max_dim=1, dtype=dtype)), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index a91154a4e45f6d..1d072a230ae3a7 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -8,11 +8,13 @@ from caffe2.python import core, model_helper, brew import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import unittest import os -class TestInstanceNorm(hu.HypothesisTestCase): + +class TestInstanceNorm(serial.SerializedTestCase): def _get_inputs(self, N, C, H, W, order): if order == 'NCHW': @@ -131,7 +133,7 @@ def test_instance_norm_layout(self, gc, dc, N, C, H, W, store_mean, atol=1e-4, rtol=1e-4) - @given(gc=hu.gcs['gc'], + @serial.given(gc=hu.gcs['gc'], dc=hu.gcs['dc'], N=st.integers(2, 10), C=st.integers(3, 10), diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py index 011720d109d6d2..6f9e5d90572ab1 100644 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ b/caffe2/python/operator_test/integral_image_ops_test.py @@ -2,15 +2,16 @@ from __future__ import division from __future__ import print_function 
from __future__ import unicode_literals -from hypothesis import given + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestIntegralImageOps(hu.HypothesisTestCase): - @given(batch_size=st.integers(1, 3), +class TestIntegralImageOps(serial.SerializedTestCase): + @serial.given(batch_size=st.integers(1, 3), height=st.integers(7, 10), width=st.integers(7, 10), channels=st.integers(1, 8), @@ -45,7 +46,7 @@ def integral_image(im): self.assertDeviceChecks(dc, op, [im], [0]) self.assertReferenceChecks(gc, op, [im], integral_image) - @given(batch_size=st.integers(1, 3), + @serial.given(batch_size=st.integers(1, 3), height=st.integers(7, 10), width=st.integers(7, 10), channels=st.integers(1, 8), diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 97a1fbaeec2222..51faa14b9029fe 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -24,8 +26,8 @@ def jsd_grad(go, o, pq_list): return [np.log(p * (1 - m) / (1 - p) / m) / 2. * go, None] -class TestJSDOps(hu.HypothesisTestCase): - @given(n=st.integers(10, 100), **hu.gcs_cpu_only) +class TestJSDOps(serial.SerializedTestCase): + @serial.given(n=st.integers(10, 100), **hu.gcs_cpu_only) def test_bernoulli_jsd(self, n, gc, dc): p = np.random.rand(n).astype(np.float32) q = np.random.rand(n).astype(np.float32) diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index fa0958afd1f99a..59203fd960c88b 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import brew, core +from caffe2.python.model_helper import ModelHelper from hypothesis import given import caffe2.python.hypothesis_test_util as hu -import unittest -import os +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np +import os +import unittest -from caffe2.python.model_helper import ModelHelper -class TestLayerNormOp(hu.HypothesisTestCase): +class TestLayerNormOp(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(X=hu.tensors(n=1), **hu.gcs) + @serial.given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_grad_op(self, X, gc, dc): X = X[0] if len(X.shape) == 1: diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py index 2284fdbba0785c..84e1307568f22d 100644 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ b/caffe2/python/operator_test/learning_rate_adaption_op_test.py @@ -3,16 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -from hypothesis import given -import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from 
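
The jsd_grad reference above is the derivative of the Jensen-Shannon divergence between element-wise Bernoulli(p) and Bernoulli(q) with mixture m = (p + q) / 2; the forward reference is not shown in this hunk, so the sketch below is a hedged standalone version of that standard formula:

import numpy as np

def bernoulli_kl(p, q):
    return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))

def bernoulli_jsd(p, q):
    m = (p + q) / 2.0
    return (bernoulli_kl(p, m) + bernoulli_kl(q, m)) / 2.0

# Keep probabilities strictly inside (0, 1) to avoid log(0).
p = np.random.uniform(0.01, 0.99, 10).astype(np.float32)
q = np.random.uniform(0.01, 0.99, 10).astype(np.float32)
assert np.all(bernoulli_jsd(p, q) >= 0)  # JSD is non-negative
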
hypothesis import given +import hypothesis.strategies as st import numpy as np -class TestLearningRateAdaption(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=2), +class TestLearningRateAdaption(serial.SerializedTestCase): + @serial.given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), lr_alpha=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py index 73710e520f9495..3677239817d7e7 100644 --- a/caffe2/python/operator_test/learning_rate_op_test.py +++ b/caffe2/python/operator_test/learning_rate_op_test.py @@ -5,6 +5,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st @@ -15,8 +16,8 @@ import numpy as np -class TestLearningRate(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) +class TestLearningRate(serial.SerializedTestCase): + @serial.given(**hu.gcs_cpu_only) def test_alter_learning_rate_op(self, gc, dc): iter = np.random.randint(low=1, high=1e5, size=1) active_period = int(np.random.randint(low=1, high=1e3, size=1)) diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py new file mode 100644 index 00000000000000..20ac2b25ba1103 --- /dev/null +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -0,0 +1,152 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np + + +class TestLengthSplitOperator(serial.SerializedTestCase): + + def _length_split_op_ref(self, input_lengths, n_split_array): + output = [] + n_split = n_split_array[0] + for x in input_lengths: + mod = x % n_split + val = x // n_split + 1 + for _ in range(n_split): + if mod > 0: + output.append(val) + mod -= 1 + else: + output.append(val - 1) + return [np.array(output).astype(np.int32)] + + @serial.given(**hu.gcs_cpu_only) + def test_length_split_edge(self, gc, dc): + input_lengths = np.array([3, 4, 5]).astype(np.int32) + n_split_ = np.array([5]).astype(np.int32) + # Expected output: + # [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1] + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + @given(**hu.gcs_cpu_only) + def test_length_split_arg(self, gc, dc): + input_lengths = np.array([9, 4, 5]).astype(np.int32) + n_split = 3 + # Expected output: + # [3, 3, 3, 2, 1, 1, 2, 2, 1] + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths'], + ['Y'], n_split=n_split + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths], + reference=lambda x : self._length_split_op_ref(x, [n_split]), + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths], [0]) + + @given(**hu.gcs_cpu_only) + def 
test_length_split_override_arg(self, gc, dc): + input_lengths = np.array([9, 4, 5]).astype(np.int32) + n_split_ignored = 2 + n_split_used = np.array([3]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], n_split=n_split_ignored + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_used], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_used], [0]) + + @given(m=st.integers(1, 100), n_split=st.integers(1, 20), + **hu.gcs_cpu_only) + def test_length_split_even_divide(self, m, n_split, gc, dc): + # multiples of n_split + input_lengths = np.random.randint(100, size=m).astype(np.int32) * n_split + n_split_ = np.array([n_split]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + @given(m=st.integers(1, 100), n_split=st.integers(1, 20), + **hu.gcs_cpu_only) + def test_length_split_random(self, m, n_split, gc, dc): + input_lengths = np.random.randint(100, size=m).astype(np.int32) + n_split_ = np.array([n_split]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index f879b702cd5092..d9cd2b2446045d 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsPadOp(hu.HypothesisTestCase): +class TestLengthsPadOp(serial.SerializedTestCase): - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index d37904c08ec08b..4a9a6b0ff1a9da 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -6,16 +6,17 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsTileOp(hu.HypothesisTestCase): +class TestLengthsTileOp(serial.SerializedTestCase): - @given( + @serial.given( inputs=st.integers(min_value=1, max_value=20).flatmap( lambda size: st.tuples( - hu.arrays([size]), + hu.arrays([size], dtype=np.float32), hu.arrays([size], dtype=np.int32, elements=st.integers(min_value=0, max_value=20)), ) @@ -32,7 +33,7 @@ def 
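
The reference in the new length_split_op_test.py above splits each length into n_split pieces that differ by at most one and sum back to the original. A standalone numpy check of that rule, mirroring _length_split_op_ref and the expected output spelled out in test_length_split_edge:

import numpy as np

def length_split_ref(input_lengths, n_split):
    output = []
    for x in input_lengths:
        mod, val = x % n_split, x // n_split + 1
        for _ in range(n_split):
            if mod > 0:
                output.append(val)
                mod -= 1
            else:
                output.append(val - 1)
    return np.array(output, dtype=np.int32)

splits = length_split_ref([3, 4, 5], 5)
assert splits.tolist() == [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
assert splits.reshape(3, 5).sum(axis=1).tolist() == [3, 4, 5]  # pieces sum back to the inputs
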
lengths_tile_op(data, lengths): op = core.CreateOperator( "LengthsTile", ["data", "lengths"], - ["output"] + ["output"], ) self.assertReferenceChecks( diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py index 6ffb5fc4fa843f..8bc27c31144f48 100644 --- a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ b/caffe2/python/operator_test/lengths_top_k_ops_test.py @@ -2,15 +2,17 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsTopKOps(hu.HypothesisTestCase): - @given(N=st.integers(min_value=0, max_value=10), +class TestLengthsTopKOps(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=0, max_value=10), K=st.integers(min_value=1, max_value=10), **hu.gcs_cpu_only) def test_lengths_top_k_op(self, N, K, gc, dc): diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py index c690b3aed3891e..b98100168df022 100644 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ b/caffe2/python/operator_test/listwise_l2r_operator_test.py @@ -2,6 +2,7 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py index 29ff72cd1e72db..49051442350e28 100644 --- a/caffe2/python/operator_test/locally_connected_op_test.py +++ b/caffe2/python/operator_test/locally_connected_op_test.py @@ -8,10 +8,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestLocallyConnectedOp(hu.HypothesisTestCase): - @given(N=st.integers(1, 3), +class TestLocallyConnectedOp(serial.SerializedTestCase): + @serial.given(N=st.integers(1, 3), C=st.integers(1, 3), H=st.integers(1, 5), W=st.integers(1, 5), diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index a6ea88e55737e5..e57bdb7a1d41df 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLossOps(hu.HypothesisTestCase): +class TestLossOps(serial.SerializedTestCase): - @given(n=st.integers(1, 8), **hu.gcs) + @serial.given(n=st.integers(1, 8), **hu.gcs) def test_averaged_loss(self, n, gc, dc): X = np.random.rand(n).astype(np.float32) diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py index 28f8e0a20f0a90..9e0168eacf9354 100644 --- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py +++ b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py @@ -3,16 +3,17 @@ from __future__ import print_function from __future__ import 
unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - -class TestMarginRankingCriterion(hu.HypothesisTestCase): - @given(N=st.integers(min_value=10, max_value=20), +class TestMarginRankingCriterion(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=10, max_value=20), seed=st.integers(min_value=0, max_value=65535), margin=st.floats(min_value=-0.5, max_value=0.5), **hu.gcs) diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index 4661c7715ec518..0772aee5c9b285 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -7,12 +7,13 @@ from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np import unittest -class TestMathOps(hu.HypothesisTestCase): +class TestMathOps(serial.SerializedTestCase): @given(X=hu.tensor(), exponent=st.floats(min_value=2.0, max_value=3.0), @@ -31,7 +32,7 @@ def powf_grad(g_out, outputs, fwd_inputs): output_to_grad="Y", grad_reference=powf_grad), - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), exponent=st.floats(min_value=-3.0, max_value=3.0), **hu.gcs) def test_sign(self, X, exponent, gc, dc): diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index 67fdf2cf5ffe11..1872a129e569c8 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -13,10 +13,11 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestMatMul(hu.HypothesisTestCase): - @given( +class TestMatMul(serial.SerializedTestCase): + @serial.given( M=st.integers(min_value=1, max_value=10), K=st.integers(min_value=1, max_value=10), N=st.integers(min_value=1, max_value=10), @@ -125,7 +126,7 @@ def matmul_ref(X, Y, axis_a, axis_b, trans_a, trans_b): self.assertGradientChecks(gc, op, [X, Y], 1, [0]) -class TestBatchMatMul(hu.HypothesisTestCase): +class TestBatchMatMul(serial.SerializedTestCase): @settings(max_examples=30) @given( C=st.integers(min_value=0, max_value=3), # number of batch dims @@ -214,7 +215,7 @@ def matmul_ref(X, Y, trans_a, trans_b, dtype): # Check over multiple devices self.assertDeviceChecks(dc, op, [X, Y], [0]) - @given( + @serial.given( C_1=st.integers(min_value=0, max_value=3), # number of batch dims C_2=st.integers(min_value=0, max_value=3), M=st.integers(min_value=1, max_value=10), diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index cbb1adc954784d..77c6b82625b139 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -4,16 +4,17 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class 
TestMean(hu.HypothesisTestCase): - @given( +class TestMean(serial.SerializedTestCase): + @serial.given( k=st.integers(1, 5), n=st.integers(1, 10), m=st.integers(1, 10), diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py index 1b4322e0624f70..9f3302c6e75a3f 100644 --- a/caffe2/python/operator_test/merge_id_lists_op_test.py +++ b/caffe2/python/operator_test/merge_id_lists_op_test.py @@ -3,15 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np - -from hypothesis import given -import hypothesis.strategies as st - from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.extra.numpy as hnp +import hypothesis.strategies as st +import numpy as np @st.composite @@ -53,7 +51,7 @@ def merge_arrays(vs, offs, j): return merged_lengths, merged_values -class TestMergeIdListsOp(hu.HypothesisTestCase): +class TestMergeIdListsOp(serial.SerializedTestCase): def test_merge_id_lists_ref(self): # Verify that the reference implementation is correct! lengths_0 = np.array([3, 0, 4], dtype=np.int32) @@ -69,8 +67,7 @@ def test_merge_id_lists_ref(self): np.testing.assert_array_equal(merged_lengths, expected_lengths) np.testing.assert_array_equal(merged_values, expected_values) - @given(inputs=id_list_batch(), - **hu.gcs_cpu_only) + @serial.given(inputs=id_list_batch(), **hu.gcs_cpu_only) def test_merge_id_lists_op(self, inputs, gc, dc): num_inputs = int(len(inputs) / 2) op = core.CreateOperator( diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index fa456c8382f64b..ae9d9158f5062b 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -7,12 +7,13 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st -import numpy as np import itertools as it +import numpy as np -class TestMomentsOp(hu.HypothesisTestCase): +class TestMomentsOp(serial.SerializedTestCase): def run_moments_test(self, X, axes, keepdims, gc, dc): if axes is None: op = core.CreateOperator( @@ -41,7 +42,7 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0, 1]) self.assertGradientChecks(gc, op, [X], 0, [0, 1]) - @given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), num_axes=st.integers(1, 4), **hu.gcs) def test_moments(self, X, keepdims, num_axes, gc, dc): self.run_moments_test(X, None, keepdims, gc, dc) diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py index 7bfceb61121ca2..39e358f30d386e 100644 --- a/caffe2/python/operator_test/momentum_sgd_test.py +++ b/caffe2/python/operator_test/momentum_sgd_test.py @@ -5,6 +5,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis from hypothesis import given @@ -13,8 +14,8 @@ import unittest -class TestMomentumSGD(hu.HypothesisTestCase): - @given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs) +class TestMomentumSGD(serial.SerializedTestCase): + @serial.given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs) def test_momentum_sgd(self, n, nesterov, gc, dc): param 
= np.random.rand(n).astype(np.float32) grad = np.random.rand(n).astype(np.float32) @@ -69,7 +70,7 @@ def momentum_sgd(grad, param_momentum, lr, param=None): reference=momentum_sgd ) - @given( + @serial.given( inputs=hu.tensors(n=3), momentum=st.floats(min_value=0.1, max_value=0.9), nesterov=st.booleans(), diff --git a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py index d37955ac1b4d2e..309236a281a492 100644 --- a/caffe2/python/operator_test/negate_gradient_op_test.py +++ b/caffe2/python/operator_test/negate_gradient_op_test.py @@ -3,18 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np +from caffe2.python import workspace, core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import workspace, core +import numpy as np -class TestNegateGradient(hu.HypothesisTestCase): +class TestNegateGradient(serial.SerializedTestCase): - @given(X=hu.tensor(), - inplace=st.booleans(), - **hu.gcs) + @serial.given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) def test_forward(self, X, inplace, gc, dc): def neg_grad_ref(X): return (X,) diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index c1d02de1d09690..42fde4c9452251 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -11,10 +11,11 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestNumpyTile(hu.HypothesisTestCase): - @given(ndim=st.integers(min_value=1, max_value=4), +class TestNumpyTile(serial.SerializedTestCase): + @serial.given(ndim=st.integers(min_value=1, max_value=4), seed=st.integers(min_value=0, max_value=65536), **hu.gcs_cpu_only) def test_numpy_tile(self, ndim, seed, gc, dc): diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index da1c11fbc2cc2e..19e6ee10e3ddf9 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -7,6 +7,7 @@ from caffe2.proto import caffe2_pb2 from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -25,8 +26,8 @@ def _one_hots(): max_size=sum(x[1])))) -class TestOneHotOps(hu.HypothesisTestCase): - @given( +class TestOneHotOps(serial.SerializedTestCase): + @serial.given( x=hu.tensor( min_dim=2, max_dim=2, dtype=np.int32, elements=st.integers(min_value=0, max_value=10)), @@ -56,7 +57,7 @@ def ref(x, lens, vals): op = core.CreateOperator('BatchOneHot', ["X", "LENS", "VALS"], ["Y"]) self.assertReferenceChecks(gc, op, [x, lens, vals], ref) - @given( + @serial.given( x=hu.tensor( min_dim=2, max_dim=2, dtype=np.float32, elements=st.integers(min_value=-5, max_value=5)), @@ -108,7 +109,7 @@ def ref(x, lens, boundaries): ["X", "LENS", "BOUNDARIES"], ["Y"]) self.assertReferenceChecks(gc, op, [x, lens, boundaries], ref) - @given( + @serial.given( hot_indices=hu.tensor( min_dim=1, max_dim=1, dtype=np.int64, elements=st.integers(min_value=0, max_value=42)), @@ -134,7 +135,7 @@ def 
one_hot_ref(hot_indices, size): one_hot_ref, input_device_options={'size': core.DeviceOption(caffe2_pb2.CPU)}) - @given(hot_indices=_one_hots()) + @serial.given(hot_indices=_one_hots()) def test_segment_one_hot(self, hot_indices): index_size, lengths, indices = hot_indices diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 0cba8053d53e05..eaf0ef58ba30e6 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -2,17 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu +from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given -from caffe2.proto import caffe2_pb2 +import hypothesis.strategies as st +import numpy as np +import unittest -class TestONNXWhile(hu.HypothesisTestCase): - @given( +class TestONNXWhile(serial.SerializedTestCase): + @serial.given( condition=st.booleans(), max_trip_count=st.integers(0, 100), save_scopes=st.booleans(), diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py index d54ac26c55fb36..5d3fd0e60f5e08 100644 --- a/caffe2/python/operator_test/order_switch_test.py +++ b/caffe2/python/operator_test/order_switch_test.py @@ -1,14 +1,17 @@ from __future__ import absolute_import, division, print_function, unicode_literals import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st + from caffe2.python import core from hypothesis import given class OrderSwitchOpsTest(hu.HypothesisTestCase): - @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), **hu.gcs) - def test_nchw2nhwc(self, X, gc, dc): - op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], device_option=gc) + @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + def test_nchw2nhwc(self, X, engine, gc, dc): + op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], engine=engine) def nchw2nhwc_ref(X): X_reshaped = X.transpose((0,) + tuple(range(2, X.ndim)) + (1,)) @@ -18,12 +21,14 @@ def nchw2nhwc_ref(X): self.assertGradientChecks(gc, op, [X], 0, [0]) self.assertDeviceChecks(dc, op, [X], [0]) - @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), **hu.gcs) - def test_nhwc2nchw(self, X, gc, dc): - op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], device_option=gc) + @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + def test_nhwc2nchw(self, X, engine, gc, dc): + op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], engine=engine) def nhwc2nchw_ref(X): - X_reshaped = X.transpose((0, X.ndim - 1) + tuple(range(1, X.ndim - 1))) + X_reshaped = X.transpose( + (0, X.ndim - 1) + tuple(range(1, X.ndim - 1))) return (X_reshaped,) self.assertReferenceChecks(gc, op, [X], nhwc2nchw_ref) diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 3935ca8c8f17ab..f6674ed625bd45 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ b/caffe2/python/operator_test/pack_ops_test.py @@ -5,6 +5,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import 
caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given from hypothesis import strategies as st @@ -12,7 +13,7 @@ import time -class TestTensorPackOps(hu.HypothesisTestCase): +class TestTensorPackOps(serial.SerializedTestCase): def pack_segments_ref(self, return_presence_mask=False, max_length=None): def pack_segments_ref(lengths, data, max_length=max_length): @@ -53,7 +54,7 @@ def pack_segments_ref(lengths, data, max_length=max_length): return pack_segments_ref - @given( + @serial.given( num_seq=st.integers(10, 100), cell_size=st.integers(1, 10), **hu.gcs diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index a5a3d6de537d01..6bf2315ca0c52e 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestPackRNNSequenceOperator(hu.HypothesisTestCase): +class TestPackRNNSequenceOperator(serial.SerializedTestCase): - @given(n=st.integers(0, 10), k=st.integers(1, 5), + @serial.given(n=st.integers(0, 10), k=st.integers(1, 5), dim=st.integers(1, 5), **hu.gcs_cpu_only) def test_pack_rnn_seqence(self, n, k, dim, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 @@ -47,7 +48,7 @@ def pack_op(values, lengths): # Gradient check self.assertGradientChecks(gc, op, [values, lengths], 0, [0]) - @given(n=st.integers(0, 10), k=st.integers(2, 5), + @serial.given(n=st.integers(0, 10), k=st.integers(2, 5), dim=st.integers(1, 5), **hu.gcs_cpu_only) def test_unpack_rnn_seqence(self, n, k, dim, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index ee5e001a91e2ce..43cd10c231887e 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -2,16 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest -class TestPad(hu.HypothesisTestCase): - @given(pad_t=st.integers(-5, 0), +class TestPad(serial.SerializedTestCase): + @serial.given(pad_t=st.integers(-5, 0), pad_l=st.integers(-5, 0), pad_b=st.integers(-5, 0), pad_r=st.integers(-5, 0), diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index f09dcdcde2b6d5..83d67f4beea966 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class 
TestPiecewiseLinearTransform(hu.HypothesisTestCase): +class TestPiecewiseLinearTransform(serial.SerializedTestCase): def constrain(self, v, min_val, max_val): def constrain_internal(x): return min(max(x, min_val), max_val) @@ -31,7 +32,7 @@ def transform(self, x, bounds, slopes, intercepts): y = slopes[index] * x_ + intercepts[index] return y - @given(n=st.integers(1, 100), **hu.gcs) + @serial.given(n=st.integers(1, 100), **hu.gcs) def test_multi_predictions_params_from_arg(self, n, gc, dc): slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) intercepts = np.random.uniform(-1, 1, (2, n)).astype(np.float32) diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py index 9cdeafaae50bc9..5882f7aef8346d 100644 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ b/caffe2/python/operator_test/weighted_sum_test.py @@ -14,7 +14,7 @@ class TestWeightedSumOp(serial.SerializedTestCase): - @serial.given_and_seeded( + @serial.given( n=st.integers(1, 8), m=st.integers(1, 10), d=st.integers(1, 4), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), seed=st.integers(min_value=0, max_value=65535), diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 2ebcf1d92a1240..482d16a0dfa6a6 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -8,6 +8,8 @@ from collections import namedtuple, defaultdict from past.builtins import basestring +import logging + import numpy as np from caffe2.python import core, scope, utils, workspace @@ -20,6 +22,8 @@ AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"]) _optimizer_instance_count = defaultdict(int) +logger = logging.getLogger(__name__) + class Optimizer(object): def __init__(self): @@ -554,6 +558,8 @@ def _run(self, net, param_init_net, param_info): ) if self.rowWise: + assert self.engine == "SIMD", "Got {}".format(self.engine) + shapes, types = workspace.InferShapesAndTypes([param_init_net]) if str(param) not in shapes: # Type/shape inference is not available for this param, fallback @@ -577,13 +583,24 @@ def _run(self, net, param_init_net, param_info): shape=[shapes[str(param)][0]], value=0.0 ) - else: - param_squared_sum = param_init_net.ConstantFill( - [param], - str(param) + "_squared_sum", - value=0.0 - ) + if self.engine == "SIMD_Q_FP16" or self.engine == "SIMD_Q_STOC_FP16": + shapes, types = workspace.InferShapesAndTypes([param_init_net]) + assert str(param) in shapes, shapes + shape = shapes[str(param)] + + param_squared_sum = param_init_net.Float16ConstantFill( + [], + str(param) + "_squared_sum", + value=0.0, + shape=shape, + ) + else: + param_squared_sum = param_init_net.ConstantFill( + [param], + str(param) + "_squared_sum", + value=0.0 + ) self._aux_params.local.append(param_squared_sum) @@ -604,7 +621,7 @@ def _run(self, net, param_init_net, param_info): [param, param_squared_sum, grad.indices, grad.values, lr], [param, param_squared_sum], epsilon=self.epsilon, - engine=self.engine + engine=self.engine, ) else: output_args = [param, param_squared_sum] @@ -913,19 +930,6 @@ def _run(self, net, param_init_net, param_info): **(self.init_kwargs) ) - if self.use_lr_adaption: - effective_grad = param_init_net.ConstantFill( - [param], - param + "_effgrad", - value=0.0 - ) - self._aux_params.local.append(effective_grad) - net.LearningRateAdaption( - [lr, grad, effective_grad], - [lr], - lr_alpha=self.lr_alpha, - normalized_lr_adaption=self.normalized_lr_adaption) - m1 = param_init_net.ConstantFill( [param], param + 
"_first_moment", @@ -956,35 +960,45 @@ def _run(self, net, param_init_net, param_info): 'If SparseAdam with rowWise=True, gradient must be '\ 'a gradientslice. PLease ensure that rowWise is not enabled '\ 'for the dense Adam optimizer, as it is not supported.' + + output_blobs = [param, m1, m2] + if self.use_lr_adaption: + effective_grad = str(param) + '_effective_grad' + output_blobs.append(effective_grad) + if isinstance(grad, core.GradientSlice): grad = self.dedup(net, self.sparse_dedup_aggregator, grad) if self.rowWise: op = 'RowWiseSparseAdam' else: op = 'SparseAdam' + net.__getattr__(op)( [param, m1, m2, grad.indices, grad.values, lr, iteration], - [param, m1, m2], + output_blobs, beta1=self.beta1, beta2=self.beta2, - epsilon=self.epsilon - ) + epsilon=self.epsilon) + if self.use_lr_adaption: + net.LearningRateAdaption( + [lr, grad.values, effective_grad], + [lr], + lr_alpha=self.lr_alpha, + normalized_lr_adaption=self.normalized_lr_adaption) else: + net.Adam( + [param, m1, m2, grad, lr, iteration], + output_blobs, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon) if self.use_lr_adaption: - net.Adam( - [param, m1, m2, grad, lr, iteration], - [param, m1, m2, effective_grad], - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon) - else: - net.Adam( - [param, m1, m2, grad, lr, iteration], - [param, m1, m2], - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon) + net.LearningRateAdaption( + [lr, grad, effective_grad], + [lr], + lr_alpha=self.lr_alpha, + normalized_lr_adaption=self.normalized_lr_adaption) def scale_learning_rate(self, scale): self.alpha *= scale diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index ee38fe52df8c4e..ade4e6ac9248e8 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -324,7 +324,8 @@ def _pipe_step( elif hasattr(input, 'reader'): reader = input.reader() else: - raise ValueError('in must be a reader, queue or stream.') + raise ValueError( + 'Input must be a reader, queue or stream. 
Got {}'.format(type(input))) if processor is not None: reader = ProcessingReader(reader, processor) diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 8b6f5e1a5c3033..e0122fdcc9983a 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -108,16 +108,21 @@ class DLPackWrapper { } tensor->Resize(dims); - const auto& meta = DLTypeToCaffe(dlTensor->dtype); + caffe2::TypeMeta meta = DLTypeToCaffe(dlTensor->dtype); + at::Device device = at::Device(tensor->GetDeviceType()); tensor->ShareExternalPointer( - ((int8_t*)dlTensor->data) + dlTensor->byte_offset, + at::DataPtr( + (void*)(((int8_t*)dlTensor->data) + dlTensor->byte_offset), + static_cast(dlMTensor), + [](void* t_ptr) -> void { + DLManagedTensor* mt_ptr = static_cast(t_ptr); + if (mt_ptr->destructor) { + mt_ptr->destructor(mt_ptr); + } + }, + device), meta, - 0, - [dlMTensor](void*) { - if (dlMTensor->destructor) { - dlMTensor->destructor(dlMTensor); - } - }); + 0); } Tensor* tensor; diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc index 9c15bc2145d5a9..8c547cf8eccca6 100644 --- a/caffe2/python/pybind_state_gpu.cc +++ b/caffe2/python/pybind_state_gpu.cc @@ -10,7 +10,9 @@ #include #include +#ifdef CAFFE2_USE_CUDNN #include "caffe2/core/common_cudnn.h" +#endif // CAFFE2_USE_CUDNN #include "caffe2/core/context_gpu.h" #include "caffe2/operators/operator_fallback_gpu.h" #include "caffe2/python/pybind_state_registry.h" @@ -39,10 +41,12 @@ namespace py = pybind11; void addCUDAGlobalMethods(py::module& m) { m.def("num_cuda_devices", &NumCudaDevices); m.def("get_cuda_version", &CudaVersion); +#ifdef CAFFE2_USE_CUDNN m.def("get_cudnn_version", &cudnnCompiledVersion); m.attr("cudnn_convolution_fwd_algo_count") = py::int_((int) CUDNN_CONVOLUTION_FWD_ALGO_COUNT); m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT); m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT); +#endif m.def("get_cuda_peer_access_pattern", []() { std::vector> pattern; CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern)); diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index fbfe143f66cee0..1f05d3bd1beeb4 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -198,7 +198,45 @@ void addNomnigraphMethods(pybind11::module& m) { CAFFE_ENFORCE(nn::is(n)); return nn::get(n); }, - py::return_value_policy::reference_internal); + py::return_value_policy::reference_internal) + .def( + "getAnnotation", + [](NNGraph::NodeRef n) { return getOrAddCaffe2Annotation(n); }) + .def( + "setAnnotation", + [](NNGraph::NodeRef n, Caffe2Annotation annot) { + auto* nnOp = nn::get(n); + nnOp->setAnnotation( + nom::util::make_unique(annot)); + }) + .def( + "getOperatorPredecessors", + [](NNGraph::NodeRef n) { + CAFFE_ENFORCE(nn::is(n)); + std::vector pred; + for (const auto& inEdge : n->getInEdges()) { + auto data = inEdge->tail(); + if (nn::hasProducer(data)) { + pred.emplace_back(nn::getProducer(data)); + } + } + return pred; + }, + py::return_value_policy::reference) + .def( + "getOperatorSuccessors", + [](NNGraph::NodeRef n) { + CAFFE_ENFORCE(nn::is(n)); + std::vector succ; + for (const auto& outEdge : n->getOutEdges()) { + auto data = outEdge->head(); + for (const auto& consumer : nn::getConsumers(data)) { + succ.emplace_back(consumer); + } + } + return succ; + }, + py::return_value_policy::reference); 
py::class_ nnop(m, "NeuralNetOperator"); py::class_ nndata(m, "NeuralNetData"); @@ -233,9 +271,11 @@ void addNomnigraphMethods(pybind11::module& m) { auto nnOp = nn::get(node); return opName == nnOp->getName(); }); - return g->createNode( - nom::matcher::MatchNode( - match, true, 1, !strict)); + auto node = nom::matcher::MatchNode(match); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("node"), @@ -243,9 +283,11 @@ void addNomnigraphMethods(pybind11::module& m) { .def( "createNode", [](nn::NNMatchGraph* g, nom::repr::Tensor& tensor, bool strict) { - return g->createNode( - nom::matcher::MatchNode( - nn::matchTensor(), true, 1, !strict)); + auto node = nn::NNMatchNode(nn::matchTensor()); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("tensor"), @@ -255,9 +297,11 @@ void addNomnigraphMethods(pybind11::module& m) { [](nn::NNMatchGraph* g, bool strict) { auto match = nn::NNNodeMatchCriteria( [](NNGraph::NodeRef node) { return true; }); - return g->createNode( - nom::matcher::MatchNode( - match, true, 1, !strict)); + auto node = nom::matcher::MatchNode(match); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("strict") = false) @@ -276,6 +320,14 @@ void addNomnigraphMethods(pybind11::module& m) { } return NNSubgraph(); }); + + // Annotation API + py::class_ annotation(m, "Annotation"); + annotation.def(py::init<>()) + .def("setDevice", &Caffe2Annotation::setDevice) + .def("getDevice", &Caffe2Annotation::getDevice) + .def("setDeviceType", &Caffe2Annotation::setDeviceType) + .def("getDeviceType", &Caffe2Annotation::getDeviceType); } REGISTER_PYBIND_ADDITION(addNomnigraphMethods); diff --git a/caffe2/python/serialized_test/README.md b/caffe2/python/serialized_test/README.md index 00d104d309f652..2885ed290cab8e 100644 --- a/caffe2/python/serialized_test/README.md +++ b/caffe2/python/serialized_test/README.md @@ -4,9 +4,15 @@ Major functionality lives in `serialized_test_util.py` ## How to use 1. Extend the test case class from `SerializedTestCase` -2. Change the `@given` decorator to `@given_and_seeded`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. -3. Change a call to `unittest.main()` in `__main__` to `testWithArgs`. -4. Run your test `python caffe2/python/operator_test/my_test.py -g` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one folder per test function -5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. +2. Change the `@given` decorator to `@serialized_test_util.given`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. +3. [Optional] Add (or change a call of `unittest.main()` to) `testWithArgs` in `__main__`. This allows you to generate outputs using `python caffe2/python/operator_test/my_test.py -G`. +4. 
Run your test `python -m pytest caffe2/python/operator_test/my_test.py -G` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one npz file per test function. Use `-O` to change the output directory. +5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. The comparison is done as long as you have a call to assertReferenceChecks. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. + +## Additional Notes + +If we'd like to extend the test framework beyond that for operator tests, we can create a new subfolder for them inside `caffe2/python/serialized_test/data`. + +Note that we currently don't support using other hypothesis decorators on top of `given_and_seeded`. Hypothesis has some handling to explicitly check that `@given` is on the bottom of the decorator stack. + +If there are multiple calls to assertReferenceChecks in a test function, we'll serialize and write the last one. The actual input checked may then differ if we refactor a test function that calls this multiple times, though the serialized test should still pass since we then use the serialized input to generate a dynamic output.
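To make the workflow above concrete, here is a minimal sketch of an operator test converted to this pattern, mirroring the changes applied to the operator tests in this diff. The operator (`Sum`), the strategies, and the test name are purely illustrative, and calling the entry point as `serial.testWithArgs()` in `__main__` is an assumption about where the `testWithArgs` helper mentioned in step 3 lives:

```python
from caffe2.python import core
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.serialized_test.serialized_test_util as serial
import hypothesis.strategies as st
import numpy as np


class TestCopyOps(serial.SerializedTestCase):
    # @serial.given runs one seeded example (used to write and later compare
    # the serialized inputs/outputs/gradients) in addition to the usual
    # unseeded hypothesis runs.
    @serial.given(n=st.integers(1, 5), m=st.integers(1, 10), **hu.gcs_cpu_only)
    def test_sum_single_input(self, n, m, gc, dc):
        X = np.random.rand(n, m).astype(np.float32)
        op = core.CreateOperator("Sum", ["X"], ["Y"])

        def ref(X):
            # Sum of a single input is the input itself.
            return (X,)

        # The last assertReferenceChecks call in the test function is the one
        # that gets serialized and replayed on later runs.
        self.assertReferenceChecks(gc, op, [X], ref)


if __name__ == "__main__":
    # Optional (step 3): allows `python my_test.py -G` to regenerate outputs.
    serial.testWithArgs()
```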
diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip new file mode 100644 index 00000000000000..415a47d71c3166 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip new file mode 100644 index 00000000000000..e4584245ab112c Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip new file mode 100644 index 00000000000000..0dc8e48877d431 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip new file mode 100644 index 00000000000000..07e439a921cfcf Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip new file mode 100644 index 00000000000000..2bdb95bdaf798c Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip new file mode 100644 index 00000000000000..adac8479290e62 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip new file mode 100644 index 00000000000000..9326bfd4df1da4 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip new file mode 100644 index 00000000000000..27c7db4e9a17de Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip new file mode 100644 index 00000000000000..8ddd81fbf44a9f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip b/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip new file mode 100644 index 00000000000000..cb08f9f86e6df4 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip new file mode 100644 index 00000000000000..88ec3cc8dbba70 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip new file mode 100644 index 00000000000000..a0e1408b5a731b Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip b/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip new file mode 100644 index 00000000000000..5a115b00e3b154 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip b/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip new file mode 100644 index 00000000000000..73717a440d9581 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip b/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip new file mode 100644 index 00000000000000..bb95ce7149a696 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip differ diff
--git a/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip b/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip new file mode 100644 index 00000000000000..eaddc47759a9c0 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip b/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip new file mode 100644 index 00000000000000..f51ee2ee182b35 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip new file mode 100644 index 00000000000000..668efa6e1643b5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip new file mode 100644 index 00000000000000..12692067370534 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip b/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip new file mode 100644 index 00000000000000..166b4b1d8022ce Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip b/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip new file mode 100644 index 00000000000000..ccdd2257ffc71b Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip b/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip new file mode 100644 index 00000000000000..928a74f90cec5d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip b/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip new file mode 100644 index 00000000000000..2c3e35be43a175 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip b/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip new file mode 100644 index 00000000000000..1c9f5abc9bd0c5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip new file mode 100644 index 00000000000000..f0e22405a92f76 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip b/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip new file mode 100644 index 00000000000000..d19477ff24f940 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip b/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip new file mode 100644 index 00000000000000..016c1495758286 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip b/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip new file mode 100644 index 00000000000000..a1768f9b2d700f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip new file mode 100644 index 00000000000000..709d1674d3052d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip new file mode 100644 index 00000000000000..9939456c7dbc89 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip new file mode 100644 index 00000000000000..6a22bf9d5b9c41 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip b/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip new file mode 100644 index 00000000000000..2b3813c79aee7e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip b/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip new file mode 100644 index 00000000000000..37b399e1584e51 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip b/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip new file mode 100644 index 00000000000000..11701717ee81e2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip b/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip new file mode 100644 index 00000000000000..9b87be841729ba Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip b/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip new file mode 100644 index 00000000000000..c6a3150b72d008 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip b/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip new file mode 100644 index 00000000000000..867437401d2f48 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip b/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip new file mode 100644 index 00000000000000..ef9530a9a11ce2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip new file mode 100644 index 00000000000000..a8e45abf3c8ea5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip new file mode 100644 index 00000000000000..02cb6d516ae68e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip new file mode 100644 index 00000000000000..61e709080e5c22 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip new file mode 100644 index 00000000000000..407edcbba9a970 Binary files /dev/null and 
b/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip b/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip new file mode 100644 index 00000000000000..21b93d67f95a1d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip b/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip new file mode 100644 index 00000000000000..334cc694c49e57 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip new file mode 100644 index 00000000000000..08d071b420c216 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip new file mode 100644 index 00000000000000..338cb152f91aef Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip b/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip new file mode 100644 index 00000000000000..23cc6cdedec7f1 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip b/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip new file mode 100644 index 00000000000000..234c13761bd8a8 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip b/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip new file mode 100644 index 00000000000000..91b7a6ea038fd2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip new file mode 100644 index 00000000000000..2886b5689db08e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip new file mode 100644 index 00000000000000..53885e5840d91f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip 
b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip new file mode 100644 index 00000000000000..8396b7975691ac Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip new file mode 100644 index 00000000000000..012725912304cd Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip b/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip new file mode 100644 index 00000000000000..9fd13e82fb758d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip b/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip new file mode 100644 index 00000000000000..72fe453f766f85 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip b/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip new file mode 100644 index 00000000000000..ce0af76aa78704 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip new file mode 100644 index 00000000000000..1e1eceec001f50 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip new file mode 100644 index 00000000000000..e2295f316c3a3e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip new file mode 100644 index 00000000000000..87eab5db10f061 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip b/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip new file mode 100644 index 00000000000000..00a30f3611f954 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip b/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip new file mode 100644 index 00000000000000..fa921405003358 Binary files /dev/null and 
b/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip new file mode 100644 index 00000000000000..a878aedf18c943 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip new file mode 100644 index 00000000000000..1425d94c588e3a Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip b/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip new file mode 100644 index 00000000000000..e723b255e952f7 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip b/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip new file mode 100644 index 00000000000000..0bdb30fab892a3 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip b/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip new file mode 100644 index 00000000000000..b38de79996df64 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip b/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip new file mode 100644 index 00000000000000..0d37ccb28e3ed2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip b/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip new file mode 100644 index 00000000000000..cdc2079dc27758 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip b/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip new file mode 100644 index 00000000000000..86d5742dd78e16 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip 
b/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip
new file mode 100644
index 00000000000000..ddaa761fab5a68
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip b/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip
new file mode 100644
index 00000000000000..2ef77b58d9b0d4
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip b/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip
new file mode 100644
index 00000000000000..2de39276663ad2
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip b/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip
new file mode 100644
index 00000000000000..0d9b65472803fc
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip b/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip
new file mode 100644
index 00000000000000..4c2a40e90619e0
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip b/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip
new file mode 100644
index 00000000000000..6255a751d925c2
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip
new file mode 100644
index 00000000000000..305f6fa75e5fa6
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip
new file mode 100644
index 00000000000000..3a72d02000f275
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip
new file mode 100644
index 00000000000000..bffbc66c8ea8b1
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip b/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip
new file mode 100644
index 00000000000000..2cf316c359a415
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip b/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip
new file mode 100644
index 00000000000000..652b0dc0909105
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip b/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip
new file mode 100644
index 00000000000000..3ff14dcf91ec12
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip
new file mode 100644
index 00000000000000..437698665e19ad
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip
new file mode 100644
index 00000000000000..3998133efd3545
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip b/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip
new file mode 100644
index 00000000000000..855579a23a3524
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip b/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip
new file mode 100644
index 00000000000000..8c09282a249a2d
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip
new file mode 100644
index 00000000000000..76d2671c4fbcbe
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip
new file mode 100644
index 00000000000000..d0db2a7d338495
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip
new file mode 100644
index 00000000000000..20e95bdf8633c8
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip
new file mode 100644
index 00000000000000..f6917e3c4ffa23
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip b/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip
new file mode 100644
index 00000000000000..5b29d053454923
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip b/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip
new file mode 100644
index 00000000000000..09b2dd7e7fb953
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip
new file mode 100644
index 00000000000000..8438af3a9f7867
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip
new file mode 100644
index 00000000000000..953e275453a5a9
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip b/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip
new file mode 100644
index 00000000000000..945f89561e9d77
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip b/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip
new file mode 100644
index 00000000000000..2339652431c7d4
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip
new file mode 100644
index 00000000000000..7bdda94c3fdba6
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb
deleted file mode 100644
index ba59745bd14a7b..00000000000000
Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb and
/dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz deleted file mode 100644 index 3f35572017ab82..00000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/operator_0.pb b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/operator_0.pb deleted file mode 100644 index 8fae4791be423c..00000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/operator_0.pb and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/outputs.npz b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/outputs.npz deleted file mode 100644 index 543a127bedee0e..00000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/outputs.npz and /dev/null differ diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py index ad79591fb2e1bf..feb5d8e127cb88 100644 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ b/caffe2/python/serialized_test/serialized_test_util.py @@ -7,14 +7,15 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import gradient_checker import caffe2.python.hypothesis_test_util as hu -from hypothesis import given, seed, settings +import hypothesis as hy import inspect -import numpy +import numpy as np import os import re import shutil import sys import threading +from zipfile import ZipFile operator_test_type = 'operator_test' TOP_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -23,10 +24,10 @@ _output_context = threading.local() -def given_and_seeded(*given_args, **given_kwargs): +def given(*given_args, **given_kwargs): def wrapper(f): - hyp_func = given(*given_args, **given_kwargs)(f) - fixed_seed_func = seed(0)(settings(max_examples=1)(given( + hyp_func = hy.given(*given_args, **given_kwargs)(f) + fixed_seed_func = hy.seed(0)(hy.settings(max_examples=1)(hy.given( *given_args, **given_kwargs)(f))) def func(self, *args, **kwargs): @@ -38,21 +39,36 @@ def func(self, *args, **kwargs): return wrapper +def _getGradientOrNone(op_proto): + try: + grad_ops, _ = gradient_checker.getGradientForOp(op_proto) + return grad_ops + except Exception: + return [] + + +# necessary to support converting jagged lists into numpy arrays +def _transformList(l): + ret = np.empty(len(l), dtype=np.object) + for (i, arr) in enumerate(l): + ret[i] = arr + return ret + + +def _prepare_dir(path): + if os.path.exists(path): + shutil.rmtree(path) + os.makedirs(path) + + class SerializedTestCase(hu.HypothesisTestCase): should_serialize = False def get_output_dir(self): - class_path = inspect.getfile(self.__class__) - file_name_components = os.path.basename(class_path).split('.') - test_file = file_name_components[0] - - function_name_components = self.id().split('.') - test_function = function_name_components[-1] - output_dir_arg = getattr(_output_context, 'output_dir', DATA_DIR) output_dir = os.path.join( - output_dir_arg, operator_test_type, test_file + '.' 
+ test_function) + output_dir_arg, operator_test_type) if os.path.exists(output_dir): return output_dir @@ -65,30 +81,55 @@ def get_output_dir(self): output_dir_fallback = os.path.join(cwd, serialized_dir, DATA_SUFFIX) output_dir = os.path.join( output_dir_fallback, - operator_test_type, - test_file + '.' + test_function) + operator_test_type) return output_dir + def get_output_filename(self): + class_path = inspect.getfile(self.__class__) + file_name_components = os.path.basename(class_path).split('.') + test_file = file_name_components[0] + + function_name_components = self.id().split('.') + test_function = function_name_components[-1] + + return test_file + '.' + test_function + def serialize_test(self, inputs, outputs, grad_ops, op, device_option): - def prepare_dir(path): - if os.path.exists(path): - shutil.rmtree(path) - os.makedirs(path) output_dir = self.get_output_dir() - prepare_dir(output_dir) - for (i, grad) in enumerate(grad_ops): - grad_path = os.path.join(output_dir, 'gradient_{}.pb'.format(i)) - with open(grad_path, 'wb') as f: - f.write(grad.SerializeToString()) + test_name = self.get_output_filename() + full_dir = os.path.join(output_dir, test_name) + _prepare_dir(full_dir) + + inputs = _transformList(inputs) + outputs = _transformList(outputs) device_type = int(device_option.device_type) - op_path = os.path.join(output_dir, 'operator_{}.pb'.format(device_type)) + + op_path = os.path.join(full_dir, 'op.pb') + grad_paths = [] + inout_path = os.path.join(full_dir, 'inout') + with open(op_path, 'wb') as f: f.write(op.SerializeToString()) - numpy.savez_compressed( - os.path.join(output_dir, 'inputs'), inputs=inputs) - numpy.savez_compressed( - os.path.join(output_dir, 'outputs'), outputs=outputs) + for (i, grad) in enumerate(grad_ops): + grad_path = os.path.join(full_dir, 'grad_{}.pb'.format(i)) + grad_paths.append(grad_path) + with open(grad_path, 'wb') as f: + f.write(grad.SerializeToString()) + + np.savez_compressed( + inout_path, + inputs=inputs, + outputs=outputs, + device_type=device_type) + + with ZipFile(os.path.join(output_dir, test_name + '.zip'), 'w') as z: + z.write(op_path, 'op.pb') + z.write(inout_path + '.npz', 'inout.npz') + for path in grad_paths: + z.write(path, os.path.basename(path)) + + shutil.rmtree(full_dir) def compare_test(self, inputs, outputs, grad_ops, atol=1e-7, rtol=1e-7): @@ -98,48 +139,50 @@ def parse_proto(x): return proto source_dir = self.get_output_dir() + test_name = self.get_output_filename() + full_dir = os.path.join(source_dir, test_name) + _prepare_dir(full_dir) + with ZipFile(os.path.join(source_dir, test_name + '.zip')) as z: + loaded = z.extractall(full_dir) + + op_path = os.path.join(full_dir, 'op.pb') + inout_path = os.path.join(full_dir, 'inout.npz') + loaded = np.load(inout_path, encoding='bytes') # load serialized input and output - loaded_inputs = numpy.load( - os.path.join(source_dir, 'inputs.npz'), encoding='bytes')['inputs'] + loaded_inputs = loaded['inputs'].tolist() inputs_equal = True for (x, y) in zip(inputs, loaded_inputs): - if not numpy.array_equal(x, y): + if not np.array_equal(x, y): inputs_equal = False - loaded_outputs = numpy.load(os.path.join( - source_dir, 'outputs.npz'), encoding='bytes')['outputs'] + loaded_outputs = loaded['outputs'].tolist() # load operator - found_op = False - for i in os.listdir(source_dir): - op_file = os.path.join(source_dir, i) - match = re.search('operator_(.+?)\.pb', i) - if os.path.isfile(op_file) and match: - with open(op_file, 'rb') as f: - loaded_op = f.read() - op_proto = 
parse_proto(loaded_op) - device_type = int(match.group(1)) - device_option = caffe2_pb2.DeviceOption(device_type=device_type) - grad_ops, _ = gradient_checker.getGradientForOp(op_proto) - found_op = True - break + with open(op_path, 'rb') as f: + loaded_op = f.read() + + op_proto = parse_proto(loaded_op) + device_type = loaded['device_type'] + device_option = caffe2_pb2.DeviceOption(device_type=int(device_type)) # if inputs are not the same, run serialized input through serialized op if not inputs_equal: - self.assertTrue(found_op) outputs = hu.runOpOnInput(device_option, op_proto, loaded_inputs) + grad_ops = _getGradientOrNone(op_proto) # assert outputs are equal for (x, y) in zip(outputs, loaded_outputs): - numpy.testing.assert_allclose(x, y, atol=atol, rtol=rtol) + np.testing.assert_allclose(x, y, atol=atol, rtol=rtol) # assert gradient op is equal for i in range(len(grad_ops)): - with open(os.path.join(source_dir, 'gradient_{}.pb'.format(i)), 'rb') as f: + with open(os.path.join(full_dir, 'grad_{}.pb'.format(i)), 'rb') as f: loaded_grad = f.read() grad_proto = parse_proto(loaded_grad) self.assertTrue(grad_proto == grad_ops[i]) + shutil.rmtree(full_dir) + def assertSerializedOperatorChecks( self, inputs, @@ -149,7 +192,7 @@ def assertSerializedOperatorChecks( device_option, ): if self.should_serialize: - if getattr(_output_context, 'should_write_output', False): + if getattr(_output_context, 'should_generate_output', False): self.serialize_test( inputs, outputs, gradient_operator, op, device_option) else: @@ -180,29 +223,34 @@ def assertReferenceChecks( atol, outputs_to_check, ) - grad_ops, _ = gradient_checker.getGradientForOp(op) - self.assertSerializedOperatorChecks( - inputs, - outs, - grad_ops, - op, - device_option, - ) + if not getattr(_output_context, 'disable_serialized_check', False): + grad_ops = _getGradientOrNone(op) + self.assertSerializedOperatorChecks( + inputs, + outs, + grad_ops, + op, + device_option, + ) def testWithArgs(): parser = argparse.ArgumentParser() parser.add_argument( - '-g', '--generate-serialized', action='store_true', dest='write', + '-G', '--generate-serialized', action='store_true', dest='generate', help='generate output files (default=false, compares to current files)') parser.add_argument( - '-o', '--output', default=DATA_DIR, + '-O', '--output', default=DATA_DIR, help='output directory (default: %(default)s)') + parser.add_argument( + '-D', '--disable-serialized_check', action='store_true', dest='disable', + help='disable checking serialized tests') parser.add_argument('unittest_args', nargs='*') args = parser.parse_args() sys.argv[1:] = args.unittest_args - _output_context.__setattr__('should_write_output', args.write) + _output_context.__setattr__('should_generate_output', args.generate) _output_context.__setattr__('output_dir', args.output) + _output_context.__setattr__('disable_serialized_check', args.disable) import unittest unittest.main() diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 1a579b519fe09c..383b8410ea6ae2 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -396,8 +396,8 @@ def test_converterEnforceUnusedInputs(self): net = core.Net("net") net.Relu(["X"], ["Y"]) net.Proto().external_input.extend(["fake"]) - with self.assertRaises(Exception): - transformer.AddNNPACK(net) # just testing the converter + # This should now work + transformer.AddNNPACK(net) # just testing the converter def test_converterEnforceUnusedOutputs(self): net = 
core.Net("net") diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 75124add41cecd..5e87df8058e017 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -6,13 +6,13 @@ from __future__ import unicode_literals from caffe2.proto import caffe2_pb2 +from caffe2.python.compatibility import container_abcs from future.utils import viewitems from google.protobuf.message import DecodeError, Message from google.protobuf import text_format import sys import copy -import collections import functools import numpy as np from six import integer_types, binary_type, text_type, string_types @@ -120,7 +120,7 @@ def MakeArgument(key, value): """Makes an argument based on the value type.""" argument = caffe2_pb2.Argument() argument.name = key - iterable = isinstance(value, collections.Iterable) + iterable = isinstance(value, container_abcs.Iterable) # Fast tracking common use case where a float32 array of tensor parameters # needs to be serialized. The entire array is guaranteed to have the same diff --git a/caffe2/sgd/adam_op.cc b/caffe2/sgd/adam_op.cc index 25414622bad754..623e93a07e3251 100644 --- a/caffe2/sgd/adam_op.cc +++ b/caffe2/sgd/adam_op.cc @@ -34,7 +34,7 @@ and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") - .Output(3, "output_grad", "Effective grad") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); @@ -42,7 +42,7 @@ and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output REGISTER_CPU_OPERATOR(SparseAdam, SparseAdamOp); OPERATOR_SCHEMA(SparseAdam) .NumInputs(7) - .NumOutputs(3) + .NumOutputs(3, 4) .EnforceInplace({{0, 0}, {1, 1}, {2, 2}}) .SetDoc(R"DOC( @@ -62,6 +62,7 @@ OPERATOR_SCHEMA(SparseAdam) .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); @@ -71,7 +72,7 @@ REGISTER_CPU_OPERATOR( RowWiseSparseAdamOp); OPERATOR_SCHEMA(RowWiseSparseAdam) .NumInputs(7) - .NumOutputs(3) + .NumOutputs(3, 4) .EnforceInplace({{0, 0}, {1, 1}, {2, 2}}) .SetDoc(R"DOC( @@ -95,6 +96,7 @@ OPERATOR_SCHEMA(RowWiseSparseAdam) .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index dadf7f4ee22015..699ba7aa5d23b1 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -88,7 +88,7 @@ class AdamOp final : public Operator { epsilon_(this->template GetSingleArgument("epsilon", 1e-5f)) {} bool RunOnDevice() override { // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(ITER, CPU)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(PARAM).size()); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENT_1).size()); @@ -195,58 +195,118 @@ class SparseAdamOp final : public Operator { auto* moment1Out = 
Output(OUTPUT_MOMENT_1)->template mutable_data(); auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - for (auto i = 0; i < n; ++i) { - auto idx = indices[i]; + if (OutputSize() == 3) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + paramOut[idx] = paramIn[idx] + + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; - if (block_size == 1) { - float gi = gradIn[i]; - float mi = moment1Out[idx] = - moment1In[idx] * beta1_ + gi * (1 - beta1_); - float vi = moment2Out[idx] = - moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); - paramOut[idx] = - paramIn[idx] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); +#ifndef NDEBUG + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); +#endif - } else { - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; + adam_compute( + block_size, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + beta1_, + beta2_, + epsilon_, + correction, + lr, + &context_); + } + } + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + float ngi = gradOut[i] = correction * mi / (std::sqrt(vi) + epsilon_); + paramOut[idx] = paramIn[idx] + lr[0] * ngi; + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; #ifndef NDEBUG - CAFFE_ENFORCE_GE( - Input(PARAM).size(), - block_size + offsetIdx, - this->debug_def().input(PARAM), - ", out of bound, idx:", - idx, - " for input i:", - i, - " and block size:", - block_size); - CAFFE_ENFORCE_GE( - Input(GRAD).size(), - block_size + offsetI, - this->debug_def().input(GRAD), - ", out of bound idx, idx:", - idx, - " for input i:", - i); + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); #endif - adam_compute( - block_size, - paramIn + offsetIdx, - gradIn + offsetI, - moment1In + offsetIdx, - moment2In + offsetIdx, - paramOut + offsetIdx, - moment1Out + offsetIdx, - moment2Out + offsetIdx, - beta1_, - beta2_, - epsilon_, - correction, - lr, - &context_); + adam_compute_output_grad( + block_size, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + gradOut + offsetI, + beta1_, + beta2_, + epsilon_, + 
correction, + lr, + &context_); + } } } return true; @@ -257,7 +317,7 @@ class SparseAdamOp final : public Operator { T beta2_; T epsilon_; INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; template @@ -305,61 +365,126 @@ class RowWiseSparseAdamOp final : public Operator { auto* moment1Out = Output(OUTPUT_MOMENT_1)->template mutable_data(); auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - for (auto i = 0; i < n; ++i) { - auto idx = indices[i]; + if (OutputSize() == 3) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + paramOut[idx] = paramIn[idx] + + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; - if (block_size == 1) { - float gi = gradIn[i]; - float mi = moment1Out[idx] = - moment1In[idx] * beta1_ + gi * (1 - beta1_); - float vi = moment2Out[idx] = - moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); - paramOut[idx] = - paramIn[idx] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); +#ifndef NDEBUG + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); +#endif - } else { - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; + const float* w = paramIn + offsetIdx; + const float* g = gradIn + offsetI; + const float* m1 = moment1In + offsetIdx; + const float* m2 = moment2In + idx; + float* nw = paramOut + offsetIdx; + float* nm1 = moment1Out + offsetIdx; + float* nm2 = moment2Out + idx; + + float m2_sum = 0.; + for (auto j = 0; j < block_size; ++j) { + float gj = g[j]; + m2_sum += gj * gj; + } + float vi = nm2[0] = + m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); + for (auto j = 0; j < block_size; ++j) { + float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); + nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + } + } + } + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + float ngi = gradOut[i] = correction * mi / (std::sqrt(vi) + epsilon_); + paramOut[idx] = paramIn[idx] + lr[0] * ngi; + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; #ifndef NDEBUG - CAFFE_ENFORCE_GE( - Input(PARAM).size(), - block_size + offsetIdx, - this->debug_def().input(PARAM), - ", out of bound, idx:", - idx, - " for input i:", - i, - " and block size:", - block_size); - CAFFE_ENFORCE_GE( - Input(GRAD).size(), - block_size + offsetI, - this->debug_def().input(GRAD), - ", out of bound idx, idx:", - idx, - " for input i:", - i); + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + 
this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); #endif - const float* w = paramIn + offsetIdx; - const float* g = gradIn + offsetI; - const float* m1 = moment1In + offsetIdx; - const float* m2 = moment2In + idx; - float* nw = paramOut + offsetIdx; - float* nm1 = moment1Out + offsetIdx; - float* nm2 = moment2Out + idx; - - float m2_sum = 0.; - for (auto j = 0; j < block_size; ++j) { - float gj = g[j]; - m2_sum += gj * gj; - } - float vi = nm2[0] = - m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); - for (auto j = 0; j < block_size; ++j) { - float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); - nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + const float* w = paramIn + offsetIdx; + const float* g = gradIn + offsetI; + const float* m1 = moment1In + offsetIdx; + const float* m2 = moment2In + idx; + float* nw = paramOut + offsetIdx; + float* nm1 = moment1Out + offsetIdx; + float* nm2 = moment2Out + idx; + float* ng = gradOut + offsetI; + + float m2_sum = 0.; + for (auto j = 0; j < block_size; ++j) { + float gj = g[j]; + m2_sum += gj * gj; + } + float vi = nm2[0] = + m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); + for (auto j = 0; j < block_size; ++j) { + float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); + float ngi = ng[j] = correction * mi / (std::sqrt(vi) + epsilon_); + nw[j] = w[j] + lr[0] * ngi; + } } } } @@ -371,7 +496,7 @@ class RowWiseSparseAdamOp final : public Operator { T beta2_; T epsilon_; INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; } // namespace caffe2 diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu index 41f70ca51d577b..2e142af682f795 100644 --- a/caffe2/sgd/adam_op_gpu.cu +++ b/caffe2/sgd/adam_op_gpu.cu @@ -1,6 +1,6 @@ -#include "caffe2/sgd/adam_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" +#include "caffe2/sgd/adam_op.h" namespace caffe2 { @@ -95,6 +95,55 @@ void adam_compute( N, w, g, m, v, nw, nm, nv, beta1, beta2, eps_hat, correction, lr); } +__global__ void AdamComputeOutputGrad( + int N, + const float* w, + const float* g, + const float* m, + const float* v, + float* nw, + float* nm, + float* nv, + float* ng, + float beta1, + float beta2, + float eps_hat, + float correction, + const float* lr) { + CUDA_1D_KERNEL_LOOP(i, N) { + float gi = g[i]; + float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); + float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); + float ngi = ng[i] = correction * mi / (sqrtf(vi) + eps_hat); + nw[i] = w[i] + lr[0] * ngi; + } +} + +template <> +void adam_compute_output_grad( + int N, + const float* w, + const float* g, + const float* m, + const float* v, + float* nw, + float* nm, + float* nv, + float* ng, + float beta1, + float beta2, + float eps_hat, + float correction, + const float* lr, + CUDAContext* context) { + AdamComputeOutputGrad<<< + CAFFE_GET_BLOCKS(N), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>( + N, w, g, m, v, nw, nm, nv, ng, beta1, beta2, eps_hat, correction, lr); +} + template __global__ void SparseAdamKernel( const size_t N, @@ -123,9 +172,44 @@ __global__ void SparseAdamKernel( } } +template +__global__ void 
SparseAdamOutputGradKernel( + const size_t N, + const size_t grad_slice_sz, + const float beta1, + const float beta2, + const float epsilon, + float* param, + float* mom1, + float* mom2, + float* output_grad, + const SIndex* indices, + const float* grad, + const float correction, + const float* lr, + const float iter) { + CUDA_1D_KERNEL_LOOP(i, N) { + const size_t gradIdx = i; + const SIndex index = indices[i / grad_slice_sz]; + const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz); + + float m1n = mom1[paramIdx] = + mom1[paramIdx] * beta1 + grad[gradIdx] * (1.0f - beta1); + float m2n = mom2[paramIdx] = + mom2[paramIdx] * beta2 + grad[gradIdx] * grad[gradIdx] * (1.0f - beta2); + float gradOut = output_grad[gradIdx] = + correction * m1n / (sqrt(m2n) + epsilon); + param[paramIdx] += lr[0] * gradOut; + } +} + template <> template bool SparseAdamOp::DoRunWithType() { + Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM)); + Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1)); + Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2)); + auto N = Input(GRAD).size(); auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim()); const auto iter = @@ -133,24 +217,48 @@ bool SparseAdamOp::DoRunWithType() { const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) / (1.0f - std::pow(beta1_, iter + 1)); - SparseAdamKernel - <<>>( - N, - grad_slice_sz, - beta1_, - beta2_, - epsilon_, - Output(OUTPUT_PARAM)->template mutable_data(), - Output(OUTPUT_MOMENT_1)->template mutable_data(), - Output(OUTPUT_MOMENT_2)->template mutable_data(), - Input(INDICES).template data(), - Input(GRAD).template data(), - correction, - Input(LR).template data(), - iter); + if (OutputSize() == 3) { + SparseAdamKernel + <<>>( + N, + grad_slice_sz, + beta1_, + beta2_, + epsilon_, + Output(OUTPUT_PARAM)->template mutable_data(), + Output(OUTPUT_MOMENT_1)->template mutable_data(), + Output(OUTPUT_MOMENT_2)->template mutable_data(), + Input(INDICES).template data(), + Input(GRAD).template data(), + correction, + Input(LR).template data(), + iter); + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + SparseAdamOutputGradKernel + <<>>( + N, + grad_slice_sz, + beta1_, + beta2_, + epsilon_, + Output(OUTPUT_PARAM)->template mutable_data(), + Output(OUTPUT_MOMENT_1)->template mutable_data(), + Output(OUTPUT_MOMENT_2)->template mutable_data(), + Output(OUTPUT_GRAD)->template mutable_data(), + Input(INDICES).template data(), + Input(GRAD).template data(), + correction, + Input(LR).template data(), + iter); + } + return true; } diff --git a/caffe2/sgd/fp16_momentum_sgd_op.h b/caffe2/sgd/fp16_momentum_sgd_op.h index 7b1c68634de228..a8f3ce75c7fc2c 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.h +++ b/caffe2/sgd/fp16_momentum_sgd_op.h @@ -37,8 +37,8 @@ class FP16MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/fp32_momentum_sgd_op.h b/caffe2/sgd/fp32_momentum_sgd_op.h index 75907a63501da9..57ea18a097b099 100644 --- a/caffe2/sgd/fp32_momentum_sgd_op.h +++ 
b/caffe2/sgd/fp32_momentum_sgd_op.h @@ -33,8 +33,8 @@ class FP32MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/iter_op.h b/caffe2/sgd/iter_op.h index 91709f47f3453a..22ec8d252c455f 100644 --- a/caffe2/sgd/iter_op.h +++ b/caffe2/sgd/iter_op.h @@ -39,7 +39,7 @@ class IterOp final : public Operator { bool RunOnDevice() override { if (InputSize() == 0) { LOG(INFO) << "[Input size is zero]"; - if (!OperatorBase::OutputIsType(0, CPU)) { + if (!OperatorBase::OutputIsTensorType(0, CPU)) { // This is the first run; set the iter to start with 0. LOG(ERROR) << "You are using an old definition of IterOp that will " "be deprecated soon. More specifically, IterOp now " diff --git a/caffe2/sgd/momentum_sgd_op.h b/caffe2/sgd/momentum_sgd_op.h index 6e79d5dbedc7da..c3f25c84c9b8aa 100644 --- a/caffe2/sgd/momentum_sgd_op.h +++ b/caffe2/sgd/momentum_sgd_op.h @@ -47,8 +47,8 @@ class MomentumSGDOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); @@ -87,8 +87,8 @@ class MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE_EQ(Input(LR).size(), 1); CAFFE_ENFORCE_EQ(Input(GRAD).size(), Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/yellowfin_op.h b/caffe2/sgd/yellowfin_op.h index 249f638bfac03d..94150413df1750 100644 --- a/caffe2/sgd/yellowfin_op.h +++ b/caffe2/sgd/yellowfin_op.h @@ -126,7 +126,7 @@ CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory) CAFFE2_YF_READ_INPUT(GRAD, grad) #undef CAFFE2_YF_READ_OUTPUT -CAFFE_ENFORCE(OperatorBase::InputIsType(ITER, CPU)); +CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU)); CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1); CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1); CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim()); diff --git a/caffe2/share/contrib/CMakeLists.txt b/caffe2/share/contrib/CMakeLists.txt index 01af2c0616dfce..0fc3a4186f0189 100644 --- a/caffe2/share/contrib/CMakeLists.txt +++ b/caffe2/share/contrib/CMakeLists.txt @@ -1,4 +1,4 @@ -if (USE_NNPACK) +if (USE_NNPACK AND BUILD_CAFFE2_OPS) add_subdirectory(nnpack) endif() if (USE_ZSTD) diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index 
55c4a6a6e50af7..3217198dd8cdf3 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -259,7 +259,7 @@ template constexpr _Tp&& get(array<_Tp, _Nm>&& __arr) noexcept { static_assert(_Int < _Nm, "array index is within bounds"); - return std::move(get<_Int>(__arr)); + return guts::move(get<_Int>(__arr)); } template @@ -292,12 +292,12 @@ constexpr inline array tail(const array& arg) { namespace detail { template constexpr inline array prepend_(T&& head, const array& tail, guts::index_sequence) { - return {{std::forward(head), get(tail)...}}; + return {{guts::forward(head), get(tail)...}}; } } template constexpr inline array prepend(T&& head, const array& tail) { - return detail::prepend_(std::forward(head), tail, guts::make_index_sequence()); + return detail::prepend_(guts::forward(head), tail, guts::make_index_sequence()); } /** diff --git a/caffe2/utils/Array_test.cpp b/caffe2/utils/Array_test.cpp index 1d8c290b8a2249..1f3171ebe88eb0 100644 --- a/caffe2/utils/Array_test.cpp +++ b/caffe2/utils/Array_test.cpp @@ -78,11 +78,9 @@ namespace test_tail { static_assert(array < int, 0 > {{}} == tail(array < int, 1 > {{3}}), ""); } -TEST(ArrayTest, TestPrepend) { - // Some compilers can't handle move results as constexpr, so use - // gtest assert for this test - ASSERT_EQ((array {{2, 3, 4}}), (prepend(2, array {{3, 4}}))); - ASSERT_EQ((array {{3}}), (prepend(3, array {{}}))); +namespace test_prepend { + static_assert(array < int, 3 > {{2, 3, 4}} == prepend(2, array < int, 2 > {{3, 4}}), ""); + static_assert(array < int, 1 > {{3}} == prepend(3, array < int, 0 > {{}}), ""); } namespace test_to_std_array { diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 920bffc0ae3bfb..2bb11cd22ad70e 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -130,7 +130,7 @@ __global__ void BroadcastBinaryOpCUDAKernel( } template -void BinaryOpWith2DBroadcasting( +CAFFE2_CUDA_EXPORT void BinaryOpWith2DBroadcasting( const int rows, const int cols, const bool rowwise_broadcast, @@ -177,7 +177,7 @@ void BinaryOpWith2DBroadcasting( } template -void BroadcastBinaryOpImpl( +CAFFE2_CUDA_EXPORT void BroadcastBinaryOpImpl( const int* A_dims, const int* B_dims, const int* C_dims, @@ -212,7 +212,7 @@ void BroadcastBinaryOpImpl( } template -void BroadcastBinaryOp( +CAFFE2_CUDA_EXPORT void BroadcastBinaryOp( const int A_ndim, const int* A_dims, const int B_ndim, @@ -294,7 +294,7 @@ void BroadcastBinaryOp( } \ } \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int N, const T* x, T* y, CUDAContext* context) { \ Func##CUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ @@ -362,7 +362,7 @@ DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv, utils::Inv) #define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \ template <> \ - void SinCos( \ + CAFFE2_CUDA_EXPORT void SinCos( \ const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \ SinCosCUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ @@ -376,7 +376,7 @@ CAFFE2_SPECIALIZED_CUDA_SINCOS(double) #define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int N, \ const TIn* A, \ const TIn* B, \ @@ -444,7 +444,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( #define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Rowwise##Func( \ + CAFFE2_CUDA_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -463,7 +463,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ 
template <> \ - void Rowwise##Func( \ + CAFFE2_CUDA_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -482,7 +482,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Colwise##Func( \ + CAFFE2_CUDA_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -501,7 +501,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Colwise##Func( \ + CAFFE2_CUDA_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -573,7 +573,7 @@ DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int A_ndim, \ const int* A_dims, \ const int B_ndim, \ @@ -638,7 +638,7 @@ DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ template <> \ - void Funcname( \ + CAFFE2_CUDA_EXPORT void Funcname( \ const int N, \ const T* src, \ T* dst, \ @@ -669,7 +669,7 @@ DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) // Caffe2 gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -710,7 +710,7 @@ void Gemm( } template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -781,7 +781,7 @@ void Gemm( } template <> -void BiasCHW( +CAFFE2_CUDA_EXPORT void BiasCHW( const float* bias, const float* bias_multiplier, const int bias_channels, @@ -803,7 +803,7 @@ void BiasCHW( } template <> -void GemmBatched( +CAFFE2_CUDA_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -869,7 +869,7 @@ void GemmBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -930,7 +930,7 @@ void GemmStridedBatched( } template <> -void GemmBatched( +CAFFE2_CUDA_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1059,7 +1059,7 @@ void GemmBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1168,7 +1168,7 @@ void GemmStridedBatched( // No change, but required. 
Defer to default CUDA engine template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1186,7 +1186,7 @@ void Gemm( } template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1245,7 +1245,7 @@ void Gemm( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1282,7 +1282,7 @@ void GemmStridedBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1321,7 +1321,7 @@ void GemmStridedBatched( #endif // CUDA_VERSION >= 9000 template <> -void GemmEx( +CAFFE2_CUDA_EXPORT void GemmEx( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1362,7 +1362,7 @@ void GemmEx( } template <> -void Gemv( +CAFFE2_CUDA_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -1415,7 +1415,7 @@ __global__ void AddStripedBatchKernel( #define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T) \ template <> \ - void AddStripedBatch( \ + CAFFE2_CUDA_EXPORT void AddStripedBatch( \ const int N, \ const T* first, \ T* Y, \ @@ -1434,7 +1434,7 @@ CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float16); #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH template <> -void Gemv( +CAFFE2_CUDA_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -1514,7 +1514,7 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) { #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ template <> \ - void Set( \ + CAFFE2_CUDA_API void Set( \ const size_t N, const T alpha, T* Y, CUDAContext* context) { \ if (N == 0) { \ return; \ @@ -1542,7 +1542,7 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t); #undef CAFFE2_SPECIALIZED_CUDA_SET template <> -void Set( +CAFFE2_CUDA_EXPORT void Set( const size_t N, const float16 alpha, float16* Y, @@ -1577,7 +1577,7 @@ UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { } // namespace template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const float min, const float max, @@ -1592,7 +1592,7 @@ void RandUniform( } template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const double min, const double max, @@ -1608,7 +1608,7 @@ void RandUniform( } template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const int min, const int max, @@ -1642,7 +1642,7 @@ size_t HandleOddLengthRandGaussian( } template <> -void RandGaussian( +CAFFE2_CUDA_EXPORT void RandGaussian( const size_t n, const float mean, const float std, @@ -1658,7 +1658,7 @@ void RandGaussian( } template <> -void RandGaussian( +CAFFE2_CUDA_EXPORT void RandGaussian( const size_t n, const double mean, const double std, @@ -1671,7 +1671,7 @@ void RandGaussian( } template <> -void Dot( +CAFFE2_CUDA_EXPORT void Dot( const int n, const float* a, const float* b, @@ -1683,7 +1683,7 @@ void Dot( } template <> -void Dot( +CAFFE2_CUDA_EXPORT void Dot( const int n, const float16* a, const float16* b, @@ -1760,7 +1760,7 @@ __global__ void SumConvertKernel(float* sum, T* dest) { } template -void SumGenericIter( +CAFFE2_CUDA_EXPORT void SumGenericIter( const int N, IterT it, T*& dest, @@ -1789,7 +1789,7 @@ void SumGenericIter( } // namespace template <> -void Sum( +CAFFE2_CUDA_EXPORT void Sum( const int N, const float* 
x, float* y, @@ -1804,7 +1804,7 @@ void Sum( } template <> -void Sum( +CAFFE2_CUDA_EXPORT void Sum( const int N, const int32_t* x, int32_t* y, @@ -1829,7 +1829,7 @@ struct FloatTransform { #define CAFFE2_MATH_SUM_FUNC(T) \ template <> \ - void Sum( \ + CAFFE2_CUDA_EXPORT void Sum( \ const int N, \ const T* x, \ T* y, \ @@ -1861,7 +1861,7 @@ struct SqrTransform { } // namespace template <> -void SumSqr( +CAFFE2_CUDA_EXPORT void SumSqr( const int N, const float* x, float* y, @@ -1880,7 +1880,7 @@ void SumSqr( #define CAFFE2_MATH_SUMSQR_FUNC(T) \ template <> \ - void SumSqr( \ + CAFFE2_CUDA_EXPORT void SumSqr( \ const int N, \ const T* x, \ T* y, \ @@ -1920,7 +1920,7 @@ SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) { } // namespace template <> -void Select( +CAFFE2_CUDA_EXPORT void Select( const int N, const int D, const float* x, @@ -1935,7 +1935,7 @@ void Select( } template <> -void Select( +CAFFE2_CUDA_EXPORT void Select( const int N, const int D, const float16* x, @@ -1985,7 +1985,7 @@ __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { } // namespace template <> -void Powx( +CAFFE2_CUDA_EXPORT void Powx( const int N, const float* a, const float b, @@ -2000,7 +2000,7 @@ void Powx( #define DELEGATE_CUBLAS_SCALE_FUNCTION(TAlpha, TData, CuBLASFunc) \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha alpha, \ const TData* x, \ @@ -2024,7 +2024,7 @@ void Powx( } \ } \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha* alpha, \ const TData* x, \ @@ -2051,7 +2051,7 @@ DELEGATE_CUBLAS_SCALE_FUNCTION(double, double, cublasDscal) #define CAFFE2_SPECIALIZED_CUDA_SCALE(TAlpha, TData) \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha alpha, \ const TData* x, \ @@ -2078,7 +2078,7 @@ DELEGATE_CUBLAS_SCALE_FUNCTION(double, double, cublasDscal) context->cuda_stream()>>>(N, alpha, x, y); \ } \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha* alpha, \ const TData* x, \ @@ -2098,7 +2098,7 @@ CAFFE2_SPECIALIZED_CUDA_SCALE(std::int64_t, std::int64_t) #undef CAFFE2_SPECIALIZED_CUDA_SCALE template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float16 alpha, const float16* x, @@ -2129,7 +2129,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float16* alpha, const float16* x, @@ -2160,7 +2160,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float alpha, const float16* x, @@ -2193,7 +2193,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float* alpha, const float16* x, @@ -2224,7 +2224,7 @@ void Scale( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const float* X, @@ -2236,7 +2236,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const double* X, @@ -2250,7 +2250,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const float16* X, @@ -2273,7 +2273,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float* alpha, const float* X, @@ -2285,7 +2285,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float* alpha, const float16* X, @@ -2379,7 +2379,7 @@ __global__ void AxpbyCUDAKernel( #define 
CAFFE2_SPECIALIZED_CUDA_AXPBY(TCoeff, TData) \ template <> \ - void Axpby( \ + CAFFE2_CUDA_EXPORT void Axpby( \ const int n, \ const TCoeff a, \ const TData* x, \ @@ -2393,7 +2393,7 @@ __global__ void AxpbyCUDAKernel( context->cuda_stream()>>>(n, a, x, b, y); \ } \ template <> \ - void Axpby( \ + CAFFE2_CUDA_EXPORT void Axpby( \ const int n, \ const TCoeff* a, \ const TData* x, \ @@ -2468,7 +2468,7 @@ __global__ void Im2ColNCHWCUDAKernel( } template -__global__ void Im2ColNHWCCUDAKernel( +__global__ void Im2ColNHWCCUDAKernel( const int n, const int input_h, const int input_w, @@ -2519,7 +2519,7 @@ __global__ void Im2ColNHWCCUDAKernel( } template -__global__ void Col2ImNCHWCUDAKernel( +__global__ void Col2ImNCHWCUDAKernel( const int n, const int input_h, const int input_w, @@ -2574,7 +2574,7 @@ __global__ void Col2ImNCHWCUDAKernel( } template -__global__ void Col2ImNHWCCUDAKernel( +__global__ void Col2ImNHWCCUDAKernel( const int n, const int input_w, const int channels, @@ -2627,7 +2627,7 @@ __global__ void Col2ImNHWCCUDAKernel( } template -__global__ void Im2ColNdNCHWCUDAKernel( +__global__ void Im2ColNdNCHWCUDAKernel( const int outer_size, const int inner_size, const int kernel_size, @@ -2683,7 +2683,7 @@ __global__ void Im2ColNdNCHWCUDAKernel( } template -void Im2ColNdNCHWCUDAImpl( +CAFFE2_CUDA_EXPORT void Im2ColNdNCHWCUDAImpl( const int img_size, const int col_size, const int* img_shape, @@ -2730,7 +2730,7 @@ void Im2ColNdNCHWCUDAImpl( } template -void Col2ImNdNCHWCUDAImpl( +CAFFE2_CUDA_EXPORT void Col2ImNdNCHWCUDAImpl( const int img_size, const int col_size, const int* img_shape, @@ -2780,7 +2780,7 @@ void Col2ImNdNCHWCUDAImpl( } // namespace template <> -void Im2Col( +CAFFE2_CUDA_EXPORT void Im2Col( const int channels, const int height, const int width, @@ -2826,7 +2826,7 @@ void Im2Col( } template <> -void Im2Col( +CAFFE2_CUDA_EXPORT void Im2Col( const int channels, const int height, const int width, @@ -2874,7 +2874,7 @@ void Im2Col( } template <> -void Col2Im( +CAFFE2_CUDA_EXPORT void Col2Im( const int channels, const int height, const int width, @@ -2920,7 +2920,7 @@ void Col2Im( } template <> -void Col2Im( +CAFFE2_CUDA_EXPORT void Col2Im( const int channels, const int height, const int width, @@ -2968,7 +2968,7 @@ void Col2Im( } template <> -void Im2ColNd( +CAFFE2_CUDA_EXPORT void Im2ColNd( const int N, const int img_size, const int col_size, @@ -2999,7 +2999,7 @@ void Im2ColNd( } template <> -void Col2ImNd( +CAFFE2_CUDA_EXPORT void Col2ImNd( const int N, const int img_size, const int col_size, @@ -3030,7 +3030,7 @@ void Col2ImNd( } template <> -void CopyMatrix( +CAFFE2_CUDA_EXPORT void CopyMatrix( const size_t itemsize, const int M, const int N, @@ -3082,7 +3082,7 @@ CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(TIndex) #undef CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX template <> -void CopyVector( +CAFFE2_CUDA_EXPORT void CopyVector( const int N, const float* src, float* dst, @@ -3152,7 +3152,7 @@ __global__ void ColwiseReduceKernel( #define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T) \ template <> \ - void RowwiseMax( \ + CAFFE2_CUDA_EXPORT void RowwiseMax( \ const int N, const int D, const T* x, T* y, CUDAContext* context) { \ RowwiseReduceKernel<<< \ std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ @@ -3166,7 +3166,7 @@ CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float) #define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T) \ template <> \ - void ColwiseMax( \ + CAFFE2_CUDA_EXPORT void ColwiseMax( \ const int N, const int D, const T* x, T* y, CUDAContext* context) { \ ColwiseReduceKernel<<< \ std::min(D, 
CAFFE_MAXIMUM_NUM_BLOCKS), \ @@ -3188,7 +3188,7 @@ maximum_kernel(const int N, const float alpha, const float* x, float* y) { } // namespace template <> -void Maximum( +CAFFE2_CUDA_EXPORT void Maximum( const int N, const float alpha, const float* x, @@ -3241,7 +3241,7 @@ __global__ void ReduceTensorCUDAKernel( } template -void ReduceTensorCUDAImpl( +CAFFE2_CUDA_EXPORT void ReduceTensorCUDAImpl( const int outer_size, const int inner_size, const int* dims, @@ -3275,7 +3275,7 @@ void ReduceTensorCUDAImpl( } template -void ReduceTensorCUDA( +CAFFE2_CUDA_EXPORT void ReduceTensorCUDA( const int num_dims, const int* dims, const int num_axes, @@ -3353,7 +3353,7 @@ void ReduceTensorCUDA( #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \ template <> \ - void ReduceMin( \ + CAFFE2_CUDA_EXPORT void ReduceMin( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3382,7 +3382,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \ template <> \ - void ReduceMax( \ + CAFFE2_CUDA_EXPORT void ReduceMax( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3411,7 +3411,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \ template <> \ - void ReduceSum( \ + CAFFE2_CUDA_EXPORT void ReduceSum( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3440,7 +3440,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \ template <> \ - void ReduceMean( \ + CAFFE2_CUDA_EXPORT void ReduceMean( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3496,7 +3496,7 @@ __global__ void BroadcastCUDAKernel( } template -void BroadcastCUDAImpl( +CAFFE2_CUDA_EXPORT void BroadcastCUDAImpl( const int X_ndim, const int* X_dims, const int* Y_dims, @@ -3534,7 +3534,7 @@ void BroadcastCUDAImpl( #define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T) \ template <> \ - void Broadcast( \ + CAFFE2_CUDA_EXPORT void Broadcast( \ const int X_ndim, \ const int* X_dims, \ const int Y_ndim, \ @@ -3676,7 +3676,7 @@ __global__ void MomentsCUDAKernel( } template -void MomentsCUDAImpl( +CAFFE2_CUDA_EXPORT void MomentsCUDAImpl( const int outer_size, const int inner_size, const int* dims, @@ -3700,7 +3700,7 @@ void MomentsCUDAImpl( } template -void MomentsCUDA( +CAFFE2_CUDA_EXPORT void MomentsCUDA( const int num_dims, const int* dims, const int num_axes, @@ -3783,7 +3783,7 @@ void MomentsCUDA( #define CAFFE2_SPECIALIZED_CUDA_MOMENTS(T) \ template <> \ - void Moments( \ + CAFFE2_CUDA_EXPORT void Moments( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3819,7 +3819,7 @@ DELEGATE_INV_STD_KERNEL_FUNCTION(float, rsqrtf) #define CAFFE2_SPECIALIZED_CUDA_INV_STD(T) \ template <> \ - void InvStd( \ + CAFFE2_CUDA_EXPORT void InvStd( \ const int N, \ const T epsilon, \ const T* var, \ @@ -3861,7 +3861,7 @@ __global__ void TransposeCUDAKernel( } template -void TransposeCUDAImpl( +CAFFE2_CUDA_EXPORT void TransposeCUDAImpl( const int* dims, const int* axes, const T* X, @@ -3886,7 +3886,7 @@ void TransposeCUDAImpl( #define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \ template <> \ - void Transpose( \ + CAFFE2_CUDA_EXPORT void Transpose( \ const int ndim, \ const int* dims, \ const int* axes, \ @@ -3933,7 +3933,7 @@ __global__ void AffineChannelCUDAKernel( #define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T, kOrder) \ template <> \ - void AffineChannel( \ + CAFFE2_CUDA_EXPORT void AffineChannel( \ const int N, \ const int C, \ const int HxW, \ diff --git a/caffe2/utils/proto_wrap.cc 
b/caffe2/utils/proto_wrap.cc index b573968d9095ed..eb06524cae8417 100644 --- a/caffe2/utils/proto_wrap.cc +++ b/caffe2/utils/proto_wrap.cc @@ -29,3 +29,18 @@ void ShutdownProtobufLibrary() { } } // namespace caffe2 + +namespace torch { + +// Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function +// used to avoid duplicated global variable in the case when protobuf +// is built with hidden visibility. +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { + return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); +} + +void ShutdownProtobufLibrary() { + ::google::protobuf::ShutdownProtobufLibrary(); +} + +} // namespace torch diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index ff838c58889e45..7d9a18eda18489 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -167,12 +167,20 @@ if (NOT BUILD_ATEN_MOBILE) file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*") + # these are files that are generated by the script and checked in -- the script checks + # that they are equivalent so it must be a dependency of the script + set(core_gen_checked_inputs + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Type.h + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Tensor.h + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/TensorMethods.h) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core_tmp) add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} COMMAND ${GEN_COMMAND} --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen - DEPENDS ${all_python} ${all_templates} ${cwrap_files}) + DEPENDS ${all_python} ${all_templates} ${cwrap_files} ${core_gen_checked_inputs}) # Generated headers used from a CUDA (.cu) file are # not tracked correctly in CMake. We make the libATen.so depend explicitly diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d8d8927a05241e..db36ab66bbaeca 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -187,20 +187,19 @@ endif() if(BUILD_TEST) # Preserve build options. set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(TEMP_CMAKE_DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX}) # We will build gtest as static libs and embed it directly into the binary. set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) # For gtest, we will simply embed it into our test binaries, so we won't # need to install it. - set(BUILD_GTEST ON) - set(INSTALL_GTEST OFF) + set(BUILD_GTEST ON CACHE BOOL "Build gtest" FORCE) + set(INSTALL_GTEST OFF CACHE BOOL "Install gtest." FORCE) # We currently don't need gmock right now. - set(BUILD_GMOCK OFF) + set(BUILD_GMOCK OFF CACHE BOOL "Build gmock." FORCE) # For Windows, we will check the runtime used is correctly passed in. if (NOT CAFFE2_USE_MSVC_STATIC_RUNTIME) - set(gtest_force_shared_crt ON) + set(gtest_force_shared_crt ON CACHE BOOL "force shared crt on gtest" FORCE) endif() add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest) include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/include) @@ -212,10 +211,8 @@ if(BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark) include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark/include) - # Recover build options. Unfortunately gtest modifies CMAKE_DEBUG_POSTFIX - # in some versions as detailed at https://github.com/google/googletest/issues/1334 + # Recover build options. 
set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - set(CMAKE_DEBUG_POSTFIX ${TEMP_CMAKE_DEBUG_POSTFIX} CACHE BOOL "Debug postfix" FORCE) endif() # ---[ LMDB @@ -352,7 +349,9 @@ if(BUILD_PYTHON) execute_process( COMMAND "which" "python" RESULT_VARIABLE _exitcode OUTPUT_VARIABLE _py_exe) if(${_exitcode} EQUAL 0) - string(STRIP ${_py_exe} PYTHON_EXECUTABLE) + if (NOT MSVC) + string(STRIP ${_py_exe} PYTHON_EXECUTABLE) + endif() message(STATUS "Setting Python to ${PYTHON_EXECUTABLE}") endif() endif() @@ -391,7 +390,11 @@ if(BUILD_PYTHON) pycmd_no_exit(_py_lib _exitcode "from sysconfig import get_paths; print(get_paths()['stdlib'])") if("${_exitcode}" EQUAL 0 AND EXISTS "${_py_lib}" AND EXISTS "${_py_lib}") SET(PYTHON_LIBRARY "${_py_lib}") - message(STATUS "Setting Python's library to ${_py_lib}") + if (MSVC) + STRING(REPLACE "Lib" "libs" _py_static_lib ${_py_lib}) + link_directories(${_py_static_lib}) + endif() + message(STATUS "Setting Python's library to ${PYTHON_LIBRARY}") endif() endif(NOT DEFINED PYTHON_LIBRARY) @@ -545,6 +548,7 @@ if(NOT BUILD_ATEN_MOBILE) list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-negative) list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-overflow) list(APPEND HIP_HIPCC_FLAGS -Wno-unused-command-line-argument) + list(APPEND HIP_HIPCC_FLAGS -Wno-duplicate-decl-specifier) set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDES}) @@ -770,7 +774,7 @@ if (USE_NNAPI AND NOT ANDROID) caffe2_update_option(USE_NNAPI OFF) endif() -if (NOT BUILD_ATEN_MOBILE) +if (NOT BUILD_ATEN_MOBILE AND BUILD_CAFFE2_OPS) if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) list(APPEND Caffe2_DEPENDENCY_LIBS aten_op_header_gen) if (USE_CUDA) @@ -795,6 +799,11 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # We will build onnx as static libs and embed it directly into the binary. 
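The ONNX block that begins here reuses the same save/override/restore idiom as the gtest section above: remember the user's BUILD_SHARED_LIBS choice, force static archives while the bundled dependency is configured, then restore the value for the rest of the tree. Roughly, with some_dep/ as a placeholder for a bundled dependency such as third_party/onnx:

    # Save/override/restore sketch for embedding a bundled dependency statically
    set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})  # remember the user's choice
    set(BUILD_SHARED_LIBS OFF)                        # configure the dependency as static archives
    add_subdirectory(some_dep)                        # placeholder, e.g. third_party/onnx
    set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS})  # restore for the rest of the build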
set(BUILD_SHARED_LIBS OFF) + if (MSVC AND BUILD_SHARED_LIBS) + # That also means we want to export all symbols from the shared + # library we are building + set(ONNX_BUILD_MAIN_LIB ON) + endif() set(ONNX_USE_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) set(ONNX_USE_LITE_PROTO ${CAFFE2_USE_LITE_PROTO}) # If linking local protobuf, make sure ONNX has the same protobuf diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake index edc9b3ab3fda74..b2ca36a9677771 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake @@ -531,7 +531,9 @@ option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON) # Extra user settable flags cmake_initialize_per_config_variable(CUDA_NVCC_FLAGS "Semi-colon delimit multiple arguments.") -if(CMAKE_GENERATOR MATCHES "Visual Studio") +if(DEFINED ENV{CUDA_HOST_COMPILER}) + set(CUDA_HOST_COMPILER "$ENV{CUDA_HOST_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC") +elseif(CMAKE_GENERATOR MATCHES "Visual Studio") set(_CUDA_MSVC_HOST_COMPILER "$(VCInstallDir)Tools/MSVC/$(VCToolsVersion)/bin/Host$(Platform)/$(PlatformTarget)") if(MSVC_VERSION LESS 1910) set(_CUDA_MSVC_HOST_COMPILER "$(VCInstallDir)bin") diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 9adc2a2be8347a..ed12b3b90e5480 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -179,7 +179,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) # If we remove all reference to these pb.h files from external # libraries and binaries this rewrite can be removed. - COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 846f4d6154b8ba..a314e4d348b708 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -38,6 +38,7 @@ function (caffe2_print_configuration_summary) message(STATUS " Python includes : ${PYTHON_INCLUDE_DIRS}") message(STATUS " Python site-packages: ${PYTHON_SITE_PACKAGES}") endif() + message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") message(STATUS " BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") @@ -124,6 +125,7 @@ function (caffe2_print_configuration_summary) message(STATUS " USE_GLOO : ${USE_GLOO}") message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}") endif() + message(STATUS " TORCH_USE_CEREAL : ${TORCH_USE_CEREAL}") message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index a14b2e1b0e8b44..066a7e63f9c57a 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -7,49 +7,67 @@ # # TORCH_FOUND -- True if the system has the Torch library # TORCH_INCLUDE_DIRS -- The include directories for torch -# TORCH_LIBRARIES -- Libraries to link to +# TORCH_LIBRARIES -- Libraries to link against +# TORCH_CXX_FLAGS -- Additional (required) compiler flags # # and the following imported targets: # -# Torch -# -# and the following functions: -# -# 
torch_add_custom_op_library( ) +# torch -SET(TORCH_ROOT "${CMAKE_CURRENT_LIST_DIR}/../") +if ($ENV{TORCH_INSTALL_PREFIX}) + set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX}) +else() + # Assume we are in /share/cmake/Torch/TorchConfig.cmake + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(TORCH_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) +endif() -set(TORCH_INCLUDE_DIRS - "${TORCH_ROOT}" - "${TORCH_ROOT}/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/caffe2/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/caffe2/aten/src/TH" -) +# Include directories. +if (EXISTS "${TORCH_INSTALL_PREFIX}/lib/include") + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") +else() + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/include") +endif() + +# Library dependencies. +find_package(Caffe2 REQUIRED) -find_library(TORCH_LIBRARY torch PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) -find_library(CAFFE2_LIBRARY caffe2 PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) +find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib") +add_library(torch SHARED IMPORTED) +set(TORCH_LIBRARIES torch ${Caffe2_MAIN_LIBS}) if (@USE_CUDA@) - find_package(CUDA REQUIRED) - find_library(CAFFE2_CUDA_LIBRARY caffe2_gpu PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) - set(TORCH_CUDA_LIBRARIES -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 cuda nvrtc cudart nvToolsExt) - list(APPEND TORCH_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + if(MSVC) + set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") + if ($ENV{NVTOOLEXT_HOME}) + set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) + endif() + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + list(APPEND TORCH_INCLUDE_DIRS "${NVTOOLEXT_HOME}/include") + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + else() + set(TORCH_CUDA_LIBRARIES + ${CUDA_CUDA_LIB} + ${CUDA_NVRTC_LIB} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so + ${CUDA_LIBRARIES}) + endif() + list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() -set(TORCH_LIBRARIES - ${TORCH_LIBRARY} - ${CAFFE2_LIBRARY} - ${CAFFE2_CUDA_LIBRARY} - ${TORCH_CUDA_LIBRARIES}) +# When we build libtorch with the old GCC ABI, dependent libraries must too. +set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") -# Creates a shared library with the correct include directories -# and linker flags set to include Torch header files and link with Torch -# libraries. Also sets the C++ standard version to C++11. All options -# can be override by specifying further options on the `` CMake target. 
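With the helper function removed just below, the rewritten TorchConfig.cmake expects downstream projects to consume the imported torch target directly: link against ${TORCH_LIBRARIES}, and the ABI define carried in TORCH_CXX_FLAGS propagates through the target's interface compile options. A minimal consumer sketch (my_op and my_op.cpp are illustrative names, not part of this patch):

    # Downstream CMakeLists.txt sketch
    cmake_minimum_required(VERSION 3.5)
    project(my_op CXX)

    # Point CMAKE_PREFIX_PATH (or Torch_DIR) at the libtorch install so this resolves.
    find_package(Torch REQUIRED)

    add_library(my_op SHARED my_op.cpp)
    # TORCH_LIBRARIES contains the imported `torch` target plus the Caffe2 and
    # CUDA dependencies assembled above; linking it also pulls in
    # -D_GLIBCXX_USE_CXX11_ABI=... via the target's INTERFACE_COMPILE_OPTIONS.
    target_link_libraries(my_op "${TORCH_LIBRARIES}")
    set_property(TARGET my_op PROPERTY CXX_STANDARD 11)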
-function(torch_add_custom_op_library name source_files) - add_library(${name} SHARED ${source_files}) - target_include_directories(${name} PUBLIC "${TORCH_INCLUDE_DIRS}") - target_link_libraries(${name} "${TORCH_LIBRARIES}") - target_compile_options(${name} PUBLIC -std=c++11) -endfunction(torch_add_custom_op_library) +set_target_properties(torch PROPERTIES + IMPORTED_LOCATION ${TORCH_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${TORCH_INCLUDE_DIRS} + INTERFACE_COMPILE_OPTIONS ${TORCH_CXX_FLAGS} + CXX_STANDARD 11 +) diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh index c69d857118b2d7..82692d0acdb9b6 100644 --- a/docker/caffe2/jenkins/common/install_rocm.sh +++ b/docker/caffe2/jenkins/common/install_rocm.sh @@ -60,25 +60,30 @@ install_rocrand() { # Install rocSPARSE/hipSPARSE that will be released soon - can co-exist w/ hcSPARSE which will be removed soon install_hipsparse() { mkdir -p /opt/rocm/debians - curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.1.0.deb -o /opt/rocm/debians/rocsparse.deb - curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.1.0.deb -o /opt/rocm/debians/hipsparse.deb + curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.2.114-Linux.deb -o /opt/rocm/debians/rocsparse.deb + curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.2.55-Linux.deb -o /opt/rocm/debians/hipsparse.deb dpkg -i /opt/rocm/debians/rocsparse.deb dpkg -i /opt/rocm/debians/hipsparse.deb } # Install custom hcc containing two compiler fixes relevant to PyTorch install_customhcc() { + HIP_VERSION="1.5.18354" mkdir -p /opt/rocm/debians - curl https://s3.amazonaws.com/ossci-linux/hcc-1.2.18272-Linux.deb -o /opt/rocm/debians/hcc-1.2.18272-Linux.deb - curl https://s3.amazonaws.com/ossci-linux/hip_base-1.5.18276.deb -o /opt/rocm/debians/hip_base-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_doc-1.5.18276.deb -o /opt/rocm/debians/hip_doc-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_samples-1.5.18276.deb -o /opt/rocm/debians/hip_samples-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_hcc-1.5.18276.deb -o /opt/rocm/debians/hip_hcc-1.5.18276.deb - dpkg -i /opt/rocm/debians/hcc-1.2.18272-Linux.deb - dpkg -i /opt/rocm/debians/hip_base-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_doc-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_samples-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_hcc-1.5.18276.deb + curl https://s3.amazonaws.com/ossci-linux/hcc-1.2.18272-Linux.deb -o /opt/rocm/debians/hcc-Linux.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_base-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_base.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_doc-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_doc.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_samples-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_samples.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_hcc-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_hcc.deb + dpkg -i /opt/rocm/debians/hcc-Linux.deb + dpkg -i /opt/rocm/debians/hip_base.deb + dpkg -i /opt/rocm/debians/hip_doc.deb + dpkg -i /opt/rocm/debians/hip_samples.deb + dpkg -i /opt/rocm/debians/hip_hcc.deb + + if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then + sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake + fi } # Install Python packages depending on the base OS diff --git a/docs/Makefile b/docs/Makefile index 4a56c12ca22d89..59c2397bb023e1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -39,4 +39,4 @@ html-stable: clean: @echo "Removing everything 
under 'build'.." - @rm -r $(BUILDDIR)/html/ $(BUILDDIR)/doctrees + @rm -rf $(BUILDDIR)/html/ $(BUILDDIR)/doctrees diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile deleted file mode 100644 index a21dcf8184ad52..00000000000000 --- a/docs/cpp/Doxyfile +++ /dev/null @@ -1,2028 +0,0 @@ -# Doxyfile 1.8.14 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See -# https://www.gnu.org/software/libiconv/ for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "PyTorch" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify a logo or an icon that is included -# in the documentation. The maximum height of the logo should not exceed 55 -# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy -# the logo to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = build - -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. 
Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = "The $name class" \ - "The $name widget" \ - "The $name file" \ - is \ - provides \ - specifies \ - contains \ - represents \ - a \ - an \ - the - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
- -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = YES - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new -# page for each member. If set to NO, the documentation of a member will be part -# of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines (in the resulting output). You can put ^^ in the value part of an -# alias to insert a newline as if a physical newline was in the original file. - -ALIASES = "rst=\verbatim embed:rst:leading-asterisk" -ALIASES += "endrst=\endverbatim" - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". 
For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. -# -# Note: For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). 
This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. 
This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = YES - -# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will -# be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = YES - -# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO, -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. If set to YES, local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO, only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO, these classes will be included in the various overviews. 
This option -# has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO, these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = NO - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES, the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. 
- -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo -# list. This list is created by putting \todo commands in the documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test -# list. This list is created by putting \test commands in the documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES, the -# list will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. 
This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -GENERATE_LATEX = NO - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. See also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = YES - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = NO - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. 
If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING -# Note: If this tag is empty the current directory is searched. - -INPUT = ../../torch/csrc/api/include \ - ../../torch/csrc/api/src \ - ../../aten/src/ATen/ATen.h \ - ../../aten/src/ATen/Backend.h \ - ../../aten/src/ATen/Device.h \ - ../../aten/src/ATen/DeviceGuard.h \ - ../../aten/src/ATen/Layout.h \ - ../../aten/src/ATen/OptionsGuard.h \ - ../../aten/src/ATen/Scalar.h \ - ../../aten/src/ATen/TensorOptions.h \ - ../../aten/src/ATen/core/ArrayRef.h \ - ../../aten/src/ATen/core/DeviceType.h \ - ../../aten/src/ATen/core/Error.h \ - ../../aten/src/ATen/core/Half.h \ - ../../aten/src/ATen/core/ScalarType.h \ - ../../aten/src/ATen/cuda/CUDAGuard.h \ - ../../aten/src/ATen/cuda/CUDAStream.h \ - ../../aten/src/ATen/cuda/CUDAContext.h \ - ../../aten/src/ATen/cudnn/Descriptors.h \ - ../../aten/src/ATen/cudnn/Handles.h \ - ../../aten/src/ATen/cudnn/Types.h \ - ../../aten/src/ATen/cudnn/Utils.h \ - ../../aten/src/ATen/mkl/Descriptors.h \ - ../../build/aten/src/ATen/Tensor.h \ - ../../build/aten/src/ATen/Functions.h \ - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. 
-# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, -# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. - -FILE_PATTERNS = *.h *.cpp - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. - -EXAMPLE_PATTERNS = * - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. - -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# -# -# where is the value of the INPUT_FILTER tag, and is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. 
-# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. - -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. - -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. 
- -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see https://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output -# The default value is: YES. 
- -GENERATE_HTML = NO - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. -# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefore more robust against future updates. -# Doxygen will copy the style sheet files to the output directory. -# Note: The order of the extra style sheet files is of importance (e.g. the last -# style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# https://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. -# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -# shown in the various tree structured indices initially; the user can expand -# and collapse entries dynamically later on. Doxygen will expand the tree to -# such a level that at most the specified number of entries are visible (unless -# a fully collapsed tree already exceeds this amount). So setting the number of -# entries 1 will produce a full collapsed tree by default. 0 is a special value -# representing an infinite number of entries and will result in a full expanded -# tree by default. -# Minimum value: 0, maximum value: 9999, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files will be -# generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler (hhc.exe). 
If non-empty, -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) -# and project file content. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated -# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it -# enables the Previous and Next buttons. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). 
-# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of LaTeX formulas included as images in -# the HTML documentation. When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML -# output directory to force them to be regenerated. -# Minimum value: 8, maximum value: 50, default value: 10. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# https://www.mathjax.org) which uses client side Javascript for the rendering -# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX -# installed or if you want to formulas look prettier in the HTML output. When -# enabled you may also need to install MathJax separately and configure the path -# to it using the MATHJAX_RELPATH option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -USE_MATHJAX = NO - -# When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. -# Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. -# The default value is: HTML-CSS. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_FORMAT = HTML-CSS - -# When MathJax is enabled you need to specify the location relative to the HTML -# output directory using the MATHJAX_RELPATH option. The destination directory -# should contain the MathJax.js script. For instance, if the mathjax directory -# is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax -# Content Delivery Network so you can quickly see the result without installing -# MathJax. However, it is strongly recommended to install a local copy of -# MathJax from https://www.mathjax.org before deployment. -# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ - -# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax -# extension names that should be enabled during MathJax rendering. 
For example -# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_EXTENSIONS = - -# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces -# of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an -# example see the documentation. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_CODEFILE = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -# the HTML output. The underlying search engine uses javascript and DHTML and -# should work on any modern browser. Note that when using HTML help -# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) -# there is already a search function so this one should typically be disabled. -# For large projects the javascript based search engine can be slow, then -# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to -# search using the keyboard; to jump to the search box use <access key> + S -# (what the <access key> is depends on the OS and browser, but it is typically -# <CTRL>, <ALT>/