diff --git a/.gitmodules.aten b/.gitmodules.aten deleted file mode 100644 index e305c927f5a006..00000000000000 --- a/.gitmodules.aten +++ /dev/null @@ -1,10 +0,0 @@ -[submodule "third_party/tbb"] - path = third_party/tbb - url = https://github.com/01org/tbb - branch = tbb_2018 -[submodule "third_party/catch"] - path = third_party/catch - url = https://github.com/catchorg/Catch2.git -[submodule "third-party/cpuinfo"] - path = third_party/cpuinfo - url = https://github.com/Maratyszcza/cpuinfo.git diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index ae1a9b0756f812..fb44d4be6ad464 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -2,9 +2,15 @@ set -ex +# The INSTALL_PREFIX here must match up with test.sh +INSTALL_PREFIX="/usr/local/caffe2" LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) +CMAKE_ARGS=() + +# Setup SCCACHE +############################################################################### # Setup sccache if SCCACHE_BUCKET is set if [ -n "${SCCACHE_BUCKET}" ]; then mkdir -p ./sccache @@ -61,24 +67,29 @@ report_compile_cache_stats() { fi } -CMAKE_ARGS=("-DBUILD_BINARY=ON") -CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") -CMAKE_ARGS+=("-DUSE_ZSTD=ON") - -if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then - if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then - CMAKE_ARGS+=("-DBUILD_ATEN=ON") - fi +############################################################################### +# Explicitly set Python executable. +############################################################################### +# On Ubuntu 16.04 the default Python is still 2.7. +PYTHON="$(which python)" +if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then + PYTHON=$(which "python${BASH_REMATCH[1]}") + CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}") fi -# Run build script from scripts if applicable + +############################################################################### +# Use special scripts for Android, conda, and setup builds +############################################################################### if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then export ANDROID_NDK=/opt/ndk + CMAKE_ARGS+=("-DBUILD_BINARY=ON") + CMAKE_ARGS+=("-DBUILD_TEST=ON") + CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") + CMAKE_ARGS+=("-DUSE_ZSTD=ON") "${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@" exit 0 -fi -if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then - +elif [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then # click (required by onnx) wants these set # TODO don't think this fixes the problem for conda3 yet export LANG=C.UTF-8 @@ -96,51 +107,52 @@ if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then PROTOBUF_INCDIR=/opt/conda/include pip install -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" report_compile_cache_stats exit 0 +elif [[ $BUILD_ENVIRONMENT == *setup* ]]; then + rm -rf $INSTALL_PREFIX && mkdir $INSTALL_PREFIX + PYTHONPATH=$INSTALL_PREFIX $PYTHON setup_caffe2.py develop --install-dir $INSTALL_PREFIX + exit 0 fi -# Run cmake from ./build_caffe2 directory so it doesn't conflict with -# standard PyTorch build directory. Eventually these won't need to -# be separate. 
-rm -rf build_caffe2 -mkdir build_caffe2 -cd ./build_caffe2 -INSTALL_PREFIX="/usr/local/caffe2" +############################################################################### +# Set cmake args +############################################################################### +CMAKE_ARGS+=("-DBUILD_BINARY=ON") +CMAKE_ARGS+=("-DBUILD_TEST=ON") +CMAKE_ARGS+=("-DINSTALL_TEST=ON") +CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") +CMAKE_ARGS+=("-DUSE_ZSTD=ON") CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") -# Explicitly set Python executable. -# On Ubuntu 16.04 the default Python is still 2.7. -PYTHON="$(which python)" -if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then - PYTHON=$(which "python${BASH_REMATCH[1]}") - CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}") +if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then + if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then + CMAKE_ARGS+=("-DBUILD_ATEN=ON") + fi +fi +if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then + CMAKE_ARGS+=("-DBLAS=MKL") fi +if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then + CMAKE_ARGS+=("-DUSE_CUDA=ON") + CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell") + CMAKE_ARGS+=("-DUSE_NNPACK=OFF") -case "${BUILD_ENVIRONMENT}" in - *-mkl*) - CMAKE_ARGS+=("-DBLAS=MKL") - ;; - *-cuda*) - CMAKE_ARGS+=("-DUSE_CUDA=ON") - CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell") - CMAKE_ARGS+=("-DUSE_NNPACK=OFF") - - # Explicitly set path to NVCC such that the symlink to ccache or sccache is used - CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc") - - # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. - # Setting PATH to resolve to the right nvcc alone isn't enough. - # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589. - export CUDA_PATH="/usr/local/cuda" - - # Ensure the ccache symlink can still find the real nvcc binary. - export PATH="/usr/local/cuda/bin:$PATH" - ;; - *-rocm*) - export LANG=C.UTF-8 - export LC_ALL=C.UTF-8 - export HCC_AMDGPU_TARGET=gfx900 -esac + # Explicitly set path to NVCC such that the symlink to ccache or sccache is used + CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc") + + # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. + # Setting PATH to resolve to the right nvcc alone isn't enough. + # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589. + export CUDA_PATH="/usr/local/cuda" + + # Ensure the ccache symlink can still find the real nvcc binary. + export PATH="/usr/local/cuda/bin:$PATH" +fi +if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + export HCC_AMDGPU_TARGET=gfx900 +fi # Try to include Redis support for Linux builds if [ "$(uname)" == "Linux" ]; then @@ -154,14 +166,6 @@ if [ "$(uname)" == "Darwin" ]; then CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON") fi -# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) -# and use that if so. -if [[ -x "$(command -v cmake3)" ]]; then - CMAKE_BINARY=cmake3 -else - CMAKE_BINARY=cmake -fi - # Use a speciallized onnx namespace in CI to catch hardcoded onnx namespace CMAKE_ARGS+=("-DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI") @@ -173,10 +177,13 @@ if [[ -n "$INTEGRATED" ]]; then CMAKE_ARGS+=("-DCAFFE2_LINK_LOCAL_PROTOBUF=OFF") fi -# Configure -${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@" - -# Build +# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) +# and use that if so. 
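The -DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI flag set above works by renaming the namespace that ONNX's generated code is compiled into, so any source file that spells out onnx:: directly stops compiling in CI. A minimal sketch of that macro pattern follows; ModelProto here is a placeholder, not ONNX's actual generated header:

    // Sketch only: illustrates the namespace-macro pattern the CI flag relies on.
    #ifndef ONNX_NAMESPACE
    #define ONNX_NAMESPACE onnx            // default namespace for regular builds
    #endif

    namespace ONNX_NAMESPACE {
    struct ModelProto {};                  // stand-in for a generated protobuf class
    }

    // Portable: resolves no matter what the macro expands to.
    ONNX_NAMESPACE::ModelProto good;

    // Hard-coded: only compiles when ONNX_NAMESPACE expands to `onnx`, so a CI
    // build configured with -DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI rejects it.
    // onnx::ModelProto bad;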
+if [[ -x "$(command -v cmake3)" ]]; then + CMAKE_BINARY=cmake3 +else + CMAKE_BINARY=cmake +fi # sccache will fail for CUDA builds if all cores are used for compiling if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]] && [ -n "${SCCACHE}" ]; then MAX_JOBS=`expr $(nproc) - 1` @@ -184,6 +191,21 @@ else MAX_JOBS=$(nproc) fi + +############################################################################### +# Configure and make +############################################################################### +# Run cmake from ./build_caffe2 directory so it doesn't conflict with +# standard PyTorch build directory. Eventually these won't need to +# be separate. +rm -rf build_caffe2 +mkdir build_caffe2 +cd ./build_caffe2 + +# Configure +${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@" + +# Build if [ "$(uname)" == "Linux" ]; then make "-j${MAX_JOBS}" install else @@ -193,6 +215,11 @@ fi report_compile_cache_stats + +############################################################################### +# Install ONNX +############################################################################### + # Install ONNX into a local directory pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh old mode 100644 new mode 100755 index c020cf1b72c988..ad3ac913d81956 --- a/.jenkins/pytorch/macos-build.sh +++ b/.jenkins/pytorch/macos-build.sh @@ -21,10 +21,14 @@ rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* git submodule update --init --recursive export CMAKE_PREFIX_PATH=${PYTORCH_ENV_DIR}/miniconda3/ -# Build and test PyTorch +# Build PyTorch export MACOSX_DEPLOYMENT_TARGET=10.9 export CXX=clang++ export CC=clang +if which sccache > /dev/null; then + export CXX="sccache clang++" + export CC="sccache clang" +fi # If we run too many parallel jobs, we will OOM export MAX_JOBS=2 diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh old mode 100644 new mode 100755 index 741e10659ce7be..ce05699c6692f5 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -21,7 +21,7 @@ rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* git submodule update --init --recursive export CMAKE_PREFIX_PATH=${PYTORCH_ENV_DIR}/miniconda3/ -# Build and test PyTorch +# Test PyTorch export MACOSX_DEPLOYMENT_TARGET=10.9 export CXX=clang++ export CC=clang diff --git a/.jenkins/pytorch/win-build.sh b/.jenkins/pytorch/win-build.sh index b7825b0c4fedcb..86db838e0dced3 100755 --- a/.jenkins/pytorch/win-build.sh +++ b/.jenkins/pytorch/win-build.sh @@ -44,7 +44,15 @@ set MAGMA_HOME=%cd%\\magma :: Install sccache mkdir %CD%\\tmp_bin -if "%REBUILD%"=="" ( aws s3 cp s3://ossci-windows/sccache.exe %CD%\\tmp_bin\\sccache.exe --quiet ) +if "%REBUILD%"=="" ( + :check_sccache + %CD%\\tmp_bin\\sccache.exe --show-stats || ( + taskkill /im sccache.exe /f /t || set ERRORLEVEL=0 + del %CD%\\tmp_bin\\sccache.exe + aws s3 cp s3://ossci-windows/sccache.exe %CD%\\tmp_bin\\sccache.exe + goto :check_sccache + ) +) :: Install Miniconda3 if "%REBUILD%"=="" ( @@ -73,7 +81,7 @@ set CUDNN_ROOT_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 :: Target only our CI GPU machine's CUDA arch to speed up the build set TORCH_CUDA_ARCH_LIST=5.2 -sccache --stop-server || set ERRORLEVEL=0 +sccache --stop-server sccache --start-server sccache --zero-stats set CC=sccache cl diff --git a/CMakeLists.txt b/CMakeLists.txt index f69091a695e018..a766d0299bdf6b 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,15 @@ if(NOT DEFINED BLAS_SET_BY_USER) set(BLAS_SET_BY_USER ${BLAS_SET_BY_USER} CACHE STRING "Marks whether BLAS was manually set by user or auto-detected") endif() +# These lines are an attempt to make find_package(cuda) pick up +# libcuda.dylib, and not cuda.framework. It doesn't work all +# the time, but it seems to help for some users. +# TODO: replace this with a more robust fix +if(APPLE) + set(CMAKE_FIND_FRAMEWORK LAST) + set(CMAKE_FIND_APPBUNDLE LAST) +endif() + # ---[ Options. # Note to developers: if you add an option below, make sure you also add it to # cmake/Summary.cmake so that the summary prints out the option values. @@ -48,8 +57,11 @@ cmake_dependent_option( CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON "NOT BUILD_SHARED_LIBS" OFF) cmake_dependent_option( - BUILD_TEST "Build Caffe2 C++ test binaries (need gtest and gbenchmark)" ON + BUILD_TEST "Build Caffe2 C++ test binaries (need gtest and gbenchmark)" OFF "BUILD_CAFFE2" OFF) +cmake_dependent_option( + INSTALL_TEST "Install test binaries if BUILD_TEST is on" OFF + "BUILD_TEST" OFF) option(USE_ACL "Use ARM Compute Library" OFF) option(USE_ASAN "Use Address Sanitizer" OFF) option(USE_ATEN "Use ATen" OFF) @@ -210,12 +222,16 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-type-limits") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-compare") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-function") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() if ($ENV{WERROR}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") endif($ENV{WERROR}) @@ -273,6 +289,9 @@ if(BUILD_CAFFE2) add_subdirectory(caffe/proto) endif() +# ---[ Shared build +add_subdirectory(c10) + # ---[ Main build add_subdirectory(caffe2) diff --git a/CODEOWNERS b/CODEOWNERS index 5721f1688901bf..7cc2b3d47ec2e2 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -11,9 +11,9 @@ /requirements.txt @apaszke @soumith @colesbury @gchanan @zdevito @ezyang /torch/csrc/api/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ebetica @goldsborough /test/cpp/api/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ebetica @goldsborough -/torch/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer -/torch/csrc/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer -/torch/csrc/jit/passes/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer -/test/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer -/scripts/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer +/torch/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing +/torch/csrc/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing +/torch/csrc/jit/passes/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing 
+/test/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing +/scripts/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing /torch/lib/c10d/ @apaszke @pietern @teng-li diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index fc8b5afe496ebc..c9e948de418067 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -13,7 +13,6 @@ else() USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) - option(ATEN_NO_CONTRIB "Do not build ATen contrib" OFF) # Legacy options, which we will eventually remove cmake_dependent_option( @@ -157,13 +156,6 @@ list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/src/ATen) add_subdirectory(src/ATen) -if(ATEN_NO_CONTRIB) - message("disable contrib because ATEN_NO_CONTRIB is set") -else() - add_subdirectory(contrib/data) - add_subdirectory(contrib/meter) -endif() - if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) # Pass source, includes, and libs to parent set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) diff --git a/aten/contrib/data/BatchDataset.cc b/aten/contrib/data/BatchDataset.cc deleted file mode 100644 index e4d4ae458034ec..00000000000000 --- a/aten/contrib/data/BatchDataset.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include "BatchDataset.h" -#include "Dataset.h" -#include "ATen/ATen.h" -#include -#include -#include - -using namespace at; - -BatchDataset::BatchDataset(Dataset& dataset, uint64_t batchsize) { - BatchDataset(dataset, batchsize, true); -} - -BatchDataset::BatchDataset(Dataset& dataset, uint64_t batchsize, bool fullbatches) { - dataset_ = &dataset; - size_ = dataset_->size(); - batchsize_ = batchsize; - fullbatches_ = fullbatches; -} - -void BatchDataset::getField(uint64_t idx, std::string& fieldkey, at::Tensor& field) { - - // assertions: - assert(idx < size()); - assert(hasField(fieldkey)); - - // loop over samples: - Tensor singlefield, buffer; - uint64_t maxsize = std::min(batchsize_, size_ - idx * batchsize_); - for(uint64_t n = 0; n < maxsize; n++) { - - // get sample: - uint64_t batchidx = idx * batchsize_ + n; - dataset_->getField(batchidx, fieldkey, singlefield); - - // allocate memory for batch: - if(n == 0) { - - // determine size of batch: - std::vector fieldsize; - fieldsize.push_back(maxsize); - for(long d = 0; d < singlefield.dim(); ++d) { - fieldsize.push_back(singlefield.size(d)); - } - - // resize buffer: - field.resize_(fieldsize); - } - - // copy sample into batch: - buffer = select(field, 0, n); - buffer.copy_(singlefield); - } -} - -uint64_t BatchDataset::size() { - if(fullbatches_) - return floor(size_ / batchsize_); - else - return ceil(size_ / batchsize_); -} diff --git a/aten/contrib/data/BatchDataset.h b/aten/contrib/data/BatchDataset.h deleted file mode 100644 index 4431541ad17280..00000000000000 --- a/aten/contrib/data/BatchDataset.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef AT_BATCH_DATASET_H -#define AT_BATCH_DATASET_H - -#include "Dataset.h" -#include "ATen/ATen.h" - -class BatchDataset : public Dataset -{ -public: - BatchDataset(Dataset& dataset, uint64_t batchsize); - BatchDataset(Dataset& dataset, uint64_t batchsize, bool fullbatches); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field); - virtual uint64_t size(); -private: - Dataset* dataset_; - uint64_t batchsize_; - uint64_t size_; - bool fullbatches_; -}; - -#endif diff --git a/aten/contrib/data/CMakeLists.txt b/aten/contrib/data/CMakeLists.txt deleted file mode 100644 index 07bda18fbdef79..00000000000000 --- 
a/aten/contrib/data/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -cmake_minimum_required(VERSION 3.0) -project(ATen) - -if(APPLE) - set(CMAKE_MACOSX_RPATH ON) -endif() - -if(CMAKE_VERSION VERSION_LESS "3.1") - set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}") -else() - set(CMAKE_CXX_STANDARD 11) -endif() - -set(src - BatchDataset.cc - ConcatDataset.cc - Dataset.cc - MergeDataset.cc - ResampleDataset.cc - ShuffleDataset.cc - TensorDataset.cc - TransformDataset.cc -) - -add_library(xtdata ${TH_LINK_STYLE} ${src}) -target_link_libraries(xtdata ATen_cpu_library) -target_include_directories(xtdata PRIVATE ${ATen_CPU_INCLUDE}) -target_include_directories(xtdata PRIVATE .) - -# add_executable(test-data test/basic.cc) -# target_link_libraries(test-data xtdata) diff --git a/aten/contrib/data/ConcatDataset.cc b/aten/contrib/data/ConcatDataset.cc deleted file mode 100644 index 89ae3c1662b297..00000000000000 --- a/aten/contrib/data/ConcatDataset.cc +++ /dev/null @@ -1,63 +0,0 @@ -#include "ConcatDataset.h" -#include "Dataset.h" -#include -#include - -using namespace at; - -ConcatDataset::ConcatDataset(std::vector& datasets) { - datasets_ = &datasets; - size_ = 0; - beginindices_ = std::vector(); - endindices_ = std::vector(); - beginindices_.push_back(0); - for(Dataset* dataset : datasets) { - size_ += dataset->size(); - uint64_t curidx = endindices_.back(); - endindices_.push_back(beginindices_.back() + dataset->size()); - if(endindices_.size() > 1) - beginindices_.push_back(curidx + 1); - } -} - -void ConcatDataset::getField(uint64_t idx, std::string& fieldkey, Tensor &field) { - - // assertions: - assert(idx < size()); - assert(hasField(fieldkey)); - - // get sample from correct dataset: - uint64_t datasetidx = binarySearch(idx); - Dataset* curdataset = (*datasets_)[datasetidx]; - curdataset->getField(idx - beginindices_[datasetidx], fieldkey, field); -} - -uint64_t ConcatDataset::binarySearch(uint64_t idx) { - assert(idx < size()); // TODO: Add caching to this method. 
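The binarySearch being deleted along with ConcatDataset maps a flat sample index onto the dataset that owns it by searching the stored begin/end offsets. The same lookup can be written more directly with std::upper_bound over cumulative sizes; the sketch below illustrates that idea and is not the removed class's API:

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Map a global index to {dataset index, local index} given per-dataset sizes.
    struct ConcatIndex {
      std::vector<uint64_t> ends;  // ends[i] = cumulative size of datasets 0..i

      explicit ConcatIndex(const std::vector<uint64_t>& sizes) {
        uint64_t total = 0;
        for (uint64_t s : sizes) {
          total += s;
          ends.push_back(total);
        }
      }

      std::pair<uint64_t, uint64_t> locate(uint64_t idx) const {
        // First dataset whose cumulative end is strictly greater than idx.
        auto it = std::upper_bound(ends.begin(), ends.end(), idx);
        auto which = static_cast<uint64_t>(it - ends.begin());
        uint64_t begin = which == 0 ? 0 : ends[which - 1];
        return {which, idx - begin};
      }
    };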
- uint64_t left = 0; - uint64_t right = size_ - 1; - while(left != right) { - uint64_t middle = (right - left) / 2; - if(left == middle) { - if(idx > endindices_[left]) - left = right; - else - right = left; - } - else { - if(idx > endindices_[middle]) - left = middle; - else if(idx < beginindices_[middle]) - right = middle; - else { - left = middle; - right = middle; - } - } - } - return left; -} - -uint64_t ConcatDataset::size() { - return size_; -} diff --git a/aten/contrib/data/ConcatDataset.h b/aten/contrib/data/ConcatDataset.h deleted file mode 100644 index 2bde7ca0f514dd..00000000000000 --- a/aten/contrib/data/ConcatDataset.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef AT_CONCAT_DATASET_H -#define AT_CONCAT_DATASET_H - -#include "Dataset.h" - -class ConcatDataset : public Dataset -{ -public: - ConcatDataset(std::vector& datasets); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor &field); - virtual uint64_t size(); -private: - uint64_t binarySearch(uint64_t idx); - std::vector* datasets_; - std::vector beginindices_; - std::vector endindices_; - uint64_t size_; -}; - -#endif diff --git a/aten/contrib/data/Dataset.cc b/aten/contrib/data/Dataset.cc deleted file mode 100644 index e210be58636d4d..00000000000000 --- a/aten/contrib/data/Dataset.cc +++ /dev/null @@ -1,28 +0,0 @@ -#include "Dataset.h" -#include - -typedef std::map Fields; - -void Dataset::get(int64_t idx, Fields& fields) { - for(auto& field : fields) { - std::string fieldname = field.first; - assert(hasField(fieldname)); - getField(idx, fieldname, field.second); - } -} - -bool Dataset::hasField(std::string& fieldkey) { - auto search = fieldkeys_.find(fieldkey); - return (search != fieldkeys_.end()); -} - -std::set& Dataset::fieldKeys() { - return fieldkeys_; -} - -void Dataset::addFieldKey(std::string& fieldkey) { - fieldkeys_.insert(fieldkey); -} - -Dataset::~Dataset() { -} diff --git a/aten/contrib/data/Dataset.h b/aten/contrib/data/Dataset.h deleted file mode 100644 index 02b45acaf0a35b..00000000000000 --- a/aten/contrib/data/Dataset.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef AT_DATASET_H -#define AT_DATASET_H - -#include "ATen/ATen.h" -#include -#include -#include - -typedef std::map Fields; - -class Dataset { - std::set fieldkeys_; -public: - virtual uint64_t size() = 0; // pure virtual function - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field) = 0; - virtual bool hasField(std::string& fieldkey); - virtual std::set& fieldKeys(); - virtual void addFieldKey(std::string& fieldkey); - virtual void get(int64_t idx, Fields& fields); - virtual ~Dataset(); -}; - -#endif diff --git a/aten/contrib/data/DatasetIterator.h b/aten/contrib/data/DatasetIterator.h deleted file mode 100644 index 9f888e265a1a57..00000000000000 --- a/aten/contrib/data/DatasetIterator.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef AT_DATASET_ITERATOR_H -#define AT_DATASET_ITERATOR_H - -#include "Dataset.h" -#include - -class DatasetIterator : public std::iterator { -private: - uint64_t idx_ = 0; - Dataset* dataset_; -public: - - DatasetIterator(Dataset& dataset) { - dataset_ = &dataset; - idx_ = 0; - } - - DatasetIterator(DatasetIterator& rhs) { - DatasetIterator(*rhs.dataset_); - } - - DatasetIterator& operator ++() { - ++idx_; - return *this; - } - - DatasetIterator operator ++ (int) { - DatasetIterator tmp(*this); - ++idx_; - return tmp; - } - - friend bool operator == (const DatasetIterator& lhs, const DatasetIterator& rhs); - friend bool operator != (const DatasetIterator& lhs, const DatasetIterator& rhs); - - 
Fields operator* () const { - Fields sample; - dataset_->get(idx_, sample); - return sample; - } - - Fields* operator-> () const { - Fields sample; - dataset_->get(idx_, sample); - return &sample; - } -}; - -bool operator == (const DatasetIterator& lhs, const DatasetIterator& rhs) { - return lhs.dataset_ == rhs.dataset_; -} - -bool operator != (const DatasetIterator& lhs, const DatasetIterator& rhs) { - return lhs.dataset_ != rhs.dataset_; -} - -typedef DatasetIterator iterator; -//typedef DatasetIterator const_iterator; - -#endif - -/** -iterator { - iterator(const iterator&); - ~iterator(); - iterator& operator=(const iterator&); - iterator& operator++(); //prefix increment - reference operator*() const; - friend void swap(iterator& lhs, iterator& rhs); //C++11 I think -}; - -input_iterator : public virtual iterator { - iterator operator++(int); //postfix increment - value_type operator*() const; - pointer operator->() const; - friend bool operator==(const iterator&, const iterator&); - friend bool operator!=(const iterator&, const iterator&); -}; -//once an input iterator has been dereferenced, it is -//undefined to dereference one before that. - -output_iterator : public virtual iterator { - reference operator*() const; - iterator operator++(int); //postfix increment -}; -//dereferences may only be on the left side of an assignment -//once an input iterator has been dereferenced, it is -//undefined to dereference one before that. - -forward_iterator : input_iterator, output_iterator { - forward_iterator(); -}; -**/ diff --git a/aten/contrib/data/MergeDataset.cc b/aten/contrib/data/MergeDataset.cc deleted file mode 100644 index a3cab619c3012a..00000000000000 --- a/aten/contrib/data/MergeDataset.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include "MergeDataset.h" -#include - -using namespace at; - -MergeDataset::MergeDataset(std::vector& datasets) { - datasets_ = &datasets; - uint64_t idx = 0; - for(Dataset* dataset : *datasets_) { - for(auto& fieldkey : dataset->fieldKeys()) { - std::string fieldkeyc = fieldkey; - addFieldKey(fieldkeyc); - datasetidx_[fieldkeyc] = idx; - } - } -} - -void MergeDataset::getField(uint64_t idx, std::string& fieldkey, Tensor& field) { - assert(idx < size()); - assert(hasField(fieldkey)); - Dataset* curdataset = (*datasets_)[datasetidx_[fieldkey]]; - return curdataset->getField(idx, fieldkey, field); -} - -uint64_t MergeDataset::size() { - uint64_t size = 0; - for(Dataset* dataset : *datasets_) - size += dataset->size(); - return size; -} diff --git a/aten/contrib/data/MergeDataset.h b/aten/contrib/data/MergeDataset.h deleted file mode 100644 index 925646845bbd9d..00000000000000 --- a/aten/contrib/data/MergeDataset.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_MERGE_DATASET_H -#define AT_MERGE_DATASET_H - -#include "Dataset.h" -#include -#include - -class MergeDataset : public Dataset -{ -public: - MergeDataset(std::vector& datasets); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field); - virtual uint64_t size(); -private: - std::vector* datasets_; - std::map datasetidx_; -}; - -#endif diff --git a/aten/contrib/data/ResampleDataset.cc b/aten/contrib/data/ResampleDataset.cc deleted file mode 100644 index ce8d3547ddb4fd..00000000000000 --- a/aten/contrib/data/ResampleDataset.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "ResampleDataset.h" -#include "Dataset.h" -#include -#include - -using namespace at; - -ResampleDataset::ResampleDataset(Dataset& dataset) { - dataset_ = &dataset; - size_ = dataset.size(); - perm_ = std::vector(); - 
perm_.reserve(size_); - for(uint64_t n = 0; n < size_; ++n) - perm_[n] = n; -} - -ResampleDataset::ResampleDataset(Dataset& dataset, std::vector& perm) { - dataset_ = &dataset; - size_ = dataset.size(); - perm_ = perm; - assert(perm_.size() == size_); -} - -ResampleDataset::ResampleDataset(Dataset& dataset, std::function perm) { - dataset_ = &dataset; - size_ = dataset.size(); - permfunc_ = perm; - resample(); -} - -void ResampleDataset::getField(uint64_t idx, std::string& fieldkey, at::Tensor& field) { - assert(idx < size()); - assert(hasField(fieldkey)); - dataset_->getField(perm_[idx], fieldkey, field); -} - -void ResampleDataset::resample() { - if(permfunc_) { - perm_.reserve(size_); - for(uint64_t n = 0; n < size_; ++n) - perm_[n] = permfunc_(n); - } -} - -uint64_t ResampleDataset::size() { - return size_; -} diff --git a/aten/contrib/data/ResampleDataset.h b/aten/contrib/data/ResampleDataset.h deleted file mode 100644 index 4f84012402eff7..00000000000000 --- a/aten/contrib/data/ResampleDataset.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef AT_RESAMPLE_DATASET_H -#define AT_RESAMPLE_DATASET_H - -#include -#include -#include -#include "ATen/ATen.h" -#include "Dataset.h" - -class ResampleDataset : public Dataset -{ -public: - ResampleDataset(Dataset& dataset); - ResampleDataset(Dataset& dataset, std::vector& perm); - ResampleDataset(Dataset& dataset, std::function perm); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field); - virtual uint64_t size(); - virtual void resample(); -protected: - std::vector perm_; - uint64_t size_; -private: - Dataset* dataset_; - std::function permfunc_; - std::vector fields_; -}; - -#endif diff --git a/aten/contrib/data/ShuffleDataset.cc b/aten/contrib/data/ShuffleDataset.cc deleted file mode 100644 index 25ebb38c69f9a1..00000000000000 --- a/aten/contrib/data/ShuffleDataset.cc +++ /dev/null @@ -1,16 +0,0 @@ -#include "ShuffleDataset.h" -#include "Dataset.h" -#include - -using namespace at; - -ShuffleDataset::ShuffleDataset(Dataset& dataset) : ResampleDataset(dataset) { - resample(); -} - -void ShuffleDataset::resample() { - perm_.reserve(size_); - for(size_t n = 0; n < size_; ++n) - perm_[n] = n; - std::random_shuffle(perm_.begin(), perm_.end()); -} diff --git a/aten/contrib/data/ShuffleDataset.h b/aten/contrib/data/ShuffleDataset.h deleted file mode 100644 index 80ac6ce640d032..00000000000000 --- a/aten/contrib/data/ShuffleDataset.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef AT_SHUFFLE_DATASET_H -#define AT_SHUFFLE_DATASET_H - -#include "Dataset.h" -#include "ResampleDataset.h" - -class ShuffleDataset : public ResampleDataset -{ -public: - ShuffleDataset(Dataset& dataset); - virtual void resample(); -}; - -#endif diff --git a/aten/contrib/data/TensorDataset.cc b/aten/contrib/data/TensorDataset.cc deleted file mode 100644 index fc31e6eb6a298f..00000000000000 --- a/aten/contrib/data/TensorDataset.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include "TensorDataset.h" -#include "ATen/ATen.h" -#include - -using namespace at; - -TensorDataset::TensorDataset(Tensor& t, std::string& fieldkey) { - t_ = t; - fieldkey_ = fieldkey; - addFieldKey(fieldkey); -} - -void TensorDataset::getField(uint64_t idx, std::string& fieldkey, Tensor& field) { - - // assertions: - assert(idx < size()); - assert(fieldkey_.compare(fieldkey) == 0); - - // get sample: - Tensor buffer = select(t_, 0, idx); - field.copy_(buffer); - -} - -uint64_t TensorDataset::size() { - return t_.size(0); -} diff --git a/aten/contrib/data/TensorDataset.h b/aten/contrib/data/TensorDataset.h 
deleted file mode 100644 index 3d2db04b8e612f..00000000000000 --- a/aten/contrib/data/TensorDataset.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_TENSOR_DATASET_H -#define AT_TENSOR_DATASET_H - -#include "Dataset.h" -#include "ATen/ATen.h" -#include - -class TensorDataset : public Dataset -{ -public: - TensorDataset(at::Tensor& t, std::string& fieldkey); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field); - virtual uint64_t size(); -private: - at::Tensor t_; - std::string fieldkey_; -}; - -#endif diff --git a/aten/contrib/data/TransformDataset.cc b/aten/contrib/data/TransformDataset.cc deleted file mode 100644 index 80bdc7e84e6b46..00000000000000 --- a/aten/contrib/data/TransformDataset.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "TransformDataset.h" -#include "ATen/ATen.h" -#include "ATen/ATen.h" -#include - -using namespace at; - -TransformDataset::TransformDataset(Dataset& dataset, std::string& fieldkey, std::function& transform) { - assert(hasField(fieldkey)); - dataset_ = &dataset; - fieldkey_ = fieldkey; - transform_ = transform; -} - -void TransformDataset::getField(uint64_t idx, std::string& fieldkey, Tensor& field) { - dataset_->getField(idx, fieldkey, field); - if(fieldkey.compare(fieldkey_) == 0) { - Tensor transformed = transform_(field); - field.copy_(transformed); - } -} - -uint64_t TransformDataset::size() { - return dataset_->size(); -} diff --git a/aten/contrib/data/TransformDataset.h b/aten/contrib/data/TransformDataset.h deleted file mode 100644 index ac3737ea74a1c3..00000000000000 --- a/aten/contrib/data/TransformDataset.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef AT_TRANSFORM_DATASET_H -#define AT_TRANSFORM_DATASET_H - -#include "Dataset.h" -#include "ATen/ATen.h" -#include -#include - -using namespace at; - -class TransformDataset : public Dataset -{ -public: - TransformDataset(Dataset& dataset, std::string& fieldkey, std::function& transform); - virtual void getField(uint64_t idx, std::string& fieldkey, Tensor& field); - virtual uint64_t size(); -private: - Dataset* dataset_; - std::string fieldkey_; - std::function transform_; -}; - -#endif diff --git a/aten/contrib/data/test/basic.cc b/aten/contrib/data/test/basic.cc deleted file mode 100644 index c3e48a2785e4d4..00000000000000 --- a/aten/contrib/data/test/basic.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include "Dataset.h" -#include "DatasetIterator.h" -#include "TensorDataset.h" -#include - -using namespace at; - -int main() -{ - std::cout << "hello\n"; - - Tensor tensor = rand(CPU(kDouble), {256,32}); - - TensorDataset dataset(tensor); - DatasetIterator datasetiterator(dataset); - uint64_t cnt = 0; - for(auto& sample : datasetiterator) { - std::cout << "got sample " << cnt << std:endl; - cnt++; - } - return 0; -} diff --git a/aten/contrib/data/threadpool/ThreadPool.cc b/aten/contrib/data/threadpool/ThreadPool.cc deleted file mode 100644 index 5a1172f8912e3a..00000000000000 --- a/aten/contrib/data/threadpool/ThreadPool.cc +++ /dev/null @@ -1,77 +0,0 @@ -// dependencies: -#include "ThreadPool.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// constructor launches the specified number of workers: -ThreadPool::ThreadPool(uint64_t threads) : stop(false) { - - // loop over all threads: - for(uint64_t i = 0; i < threads; ++i) { - workers.emplace_back( - [this] { - for(;;) { - std::function task; - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait(lock, - [this]{ return this->stop || !this->tasks.empty(); }); - 
if(this->stop && this->tasks.empty()) - return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - task(); - } - } - ); - } -} - -// synchronize all the threads: -void ThreadPool::synchronize() { - for(uint64_t i = 0; i < futures.size(); i++) - futures[i].first.wait(); -} - -// poll the threads for results: -unsigned int ThreadPool::waitFor() { - - // wait until a task is finished: - uint64_t i; - std::future_status status; - do { - for(i = 0; i < futures.size(); i++) { - status = futures[i].first.wait_for(std::chrono::microseconds(0)); - if(status == std::future_status::ready) break; - } - std::this_thread::sleep_for(std::chrono::microseconds(1)); - } while (status != std::future_status::ready); - - // get the result and remove the future: - futures[i].first.get(); - unsigned int handle = futures[i].second; - iter_swap(futures.begin() + i, futures.end() - 1); - futures.pop_back(); - return handle; -} - -// the destructor joins all threads: -ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for(std::thread &worker: workers) - worker.join(); -} diff --git a/aten/contrib/data/threadpool/ThreadPool.h b/aten/contrib/data/threadpool/ThreadPool.h deleted file mode 100644 index 3024af43144100..00000000000000 --- a/aten/contrib/data/threadpool/ThreadPool.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef AT_THREADPOOL_H -#define AT_THREADPOOL_H - -// dependencies: -#include -#include -#include -#include -#include -#include - -// definition of the ThreadPool class: -class ThreadPool { - public: - explicit ThreadPool(uint64_t); - void synchronize(); - unsigned int waitFor(); - template - unsigned int enqueue(F&& f, Args&&... args); - ~ThreadPool(); - - private: - - // all the threads that can perform work: - std::vector< std::thread > workers; - - // the list of futures: - unsigned int handle = 0; - std::vector< std::pair< std::future, unsigned int > > futures; - - // the task queue: - std::queue< std::function > tasks; - - // synchronization: - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// enqueue new work item into the pool: -template -unsigned int ThreadPool::enqueue(F&& f, Args&&... args) -{ - - // create the task: - auto task = std::make_shared< std::packaged_task >( - std::bind(std::forward(f), std::forward(args)...) 
- ); - - // get future and enqueue the task: - std::future future = task->get_future(); - { - std::unique_lock lock(queue_mutex); - if(stop) throw std::runtime_error("enqueue on stopped ThreadPool"); - tasks.emplace([task](){ (*task)(); }); - } - condition.notify_one(); - - // generate handle and store future: - handle++; - futures.push_back(std::make_pair(std::move(future), handle)); - return handle; -} - -#endif diff --git a/aten/contrib/meter/APMeter.cc b/aten/contrib/meter/APMeter.cc deleted file mode 100644 index 316d4366463317..00000000000000 --- a/aten/contrib/meter/APMeter.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include "APMeter.h" -#include -#include - -using namespace at; - -APMeter::APMeter() { - reset(); -} - -void APMeter::reset() { - outputs_ = CPU(kFloat).tensor(); - targets_ = CPU(kFloat).tensor(); - n_ = 0; -} - -void APMeter::add(Tensor& output, Tensor& target) { - - // assertions and allocations: - assert(output.dim() == 2 && target.dim() == 2); - //assert(isSameSizeAs(output, target)); - assert(output.size(1) == outputs_.size(1)); - - // get current outputs and targets: - Tensor curoutputs = getOutputs(); - Tensor curtargets = getTargets(); - - // make sure underlying storages are sufficiently large: - if(numel(outputs_) < numel(curoutputs) + numel(output)) { - long long newsize = ceil(numel(outputs_) * 1.5); - outputs_.resize_({newsize + numel(output)}); - targets_.resize_({newsize + numel(output)}); - } - n_ += output.size(0); - - // store scores and targets: - uint64_t offset = (numel(curoutputs) > 0) ? curoutputs.size(0) : 0; - Tensor outputbuffer = outputs_.narrow( 0, offset, output.size(0)); - Tensor targetbuffer = targets_.narrow( 0, offset, target.size(0)); - - outputbuffer.copy_(output); - targetbuffer.copy_(target); -} - -Tensor APMeter::getOutputs() { - return outputs_.narrow(0, 0, n_); -} -Tensor APMeter::getTargets() { - return targets_.narrow(0, 0, n_); -} - -void APMeter::value(Tensor& val) { - - // get current outputs and targets: - Tensor curoutputs = getOutputs(); - Tensor curtargets = getTargets(); - - // allocate some memory: - val.resize_({curoutputs.size(1)}); - double * val_d = val.data(); - Tensor outputbuffer, targetbuffer, sortval, sortidx, sorttgt; - Tensor truepos, precision; - Tensor range = val.type().range(0,curoutputs.size(0)); - - // loop over all classes: - for(long k = 0; k < curoutputs.size(1); ++k) { - - // sort scores: - outputbuffer = curoutputs.narrow( 1, k, 1); - targetbuffer = curtargets.narrow(1, k, 1).contiguous().toType(CPU(kDouble)); - double * targetbuffer_d = targetbuffer.data(); - std::tie(sortval, sortidx) = sort(curoutputs, 0); - sorttgt = index_select(targetbuffer, 0, sortidx); - - // compue true positive sums, and precision: - truepos = cumsum(targetbuffer,0); // NOTE: Cast to double first? - precision = div(truepos, range); - double * precision_d = precision.data(); - // compute average precision: - val_d[k] = .0; - for(long n = 0; n < precision.size(0); ++n) { - if(targetbuffer_d[n] != 0.) 
- val_d[k] += precision_d[n]; - } - auto norm = sum(targetbuffer).toCDouble(); - if(norm > 0) - val_d[k] /= norm; - } -} diff --git a/aten/contrib/meter/APMeter.h b/aten/contrib/meter/APMeter.h deleted file mode 100644 index 06e8a6330130d7..00000000000000 --- a/aten/contrib/meter/APMeter.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef AT_AP_METER_H -#define AT_AP_METER_H - -#include "Meter.h" -#include "ATen/ATen.h" - -class APMeter : public Meter -{ -public: - APMeter(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); - virtual void reset(); - virtual Tensor getOutputs(); - virtual Tensor getTargets(); -private: - Tensor outputs_; - Tensor targets_; - uint64_t n_; -}; - -#endif diff --git a/aten/contrib/meter/AUCMeter.cc b/aten/contrib/meter/AUCMeter.cc deleted file mode 100644 index fdd352b7014a08..00000000000000 --- a/aten/contrib/meter/AUCMeter.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include "AUCMeter.h" -#include "APMeter.h" -#include - -using namespace at; - -AUCMeter::AUCMeter() { - reset(); -} - -void AUCMeter::reset() { - meter_ = APMeter(); -} - -void AUCMeter::add(Tensor& output, Tensor& target) { - meter_.add(output, target); -} - -void AUCMeter::value(Tensor& val) { - - // get data from APMeter: - Tensor outputs = meter_.getOutputs(); - Tensor targets = meter_.getTargets(); - - // sort scores: - Tensor sortval, sortidx, sorttgt; - std::tie(sortval, sortidx) = sort(outputs, 0, true); - sorttgt = index_select(targets, 0, sortidx); - int64_t * sortidx_d = sortidx.data(); - int16_t * targets_d = sortidx.data(); - // construct the ROC curve: - Tensor tpr = zeros(CPU(kDouble), {numel(outputs)}); - Tensor fpr = zeros(CPU(kDouble), {numel(outputs)}); - - double * tpr_d = tpr.data(); - double * fpr_d = fpr.data(); - for(long n = 1; n <= numel(outputs); ++n) { - if(targets_d[sortidx_d[n - 1]] == 1) { - tpr_d[n] = tpr_d[n - 1] + 1.; - fpr_d[n] = fpr_d[n - 1]; - } else { - tpr_d[n] = tpr_d[n - 1]; - fpr_d[n] = fpr_d[n - 1] + 1.; - } - } - tpr.div_(sum(targets)); - fpr.div_(sum(at::add(mul(targets, -1.), 1.))); - - /** - local auc = torch.cmul( - tpr:narrow(1, 1, tpr:nElement() - 1), - fpr:narrow(1, 2, fpr:nElement() - 1) - - fpr:narrow(1, 1, fpr:nElement() - 1)):sum() - */ - - val.resize_({1}).fill_( - sum(mul(tpr.narrow(0, 0, numel(tpr) - 1), - sub(fpr.narrow(0, 1, numel(tpr) - 1), - fpr.narrow(0, 0, numel(tpr) - 1))))); -} diff --git a/aten/contrib/meter/AUCMeter.h b/aten/contrib/meter/AUCMeter.h deleted file mode 100644 index 9da96e6bab65e9..00000000000000 --- a/aten/contrib/meter/AUCMeter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_AUC_METER_H -#define AT_AUC_METER_H - -#include "Meter.h" -#include "APMeter.h" -#include "ATen/ATen.h" - -class AUCMeter : public Meter -{ -public: - AUCMeter(); - virtual void reset(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); -private: - APMeter meter_; -}; - -#endif diff --git a/aten/contrib/meter/CMakeLists.txt b/aten/contrib/meter/CMakeLists.txt deleted file mode 100644 index 1f23ff50e64505..00000000000000 --- a/aten/contrib/meter/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -# requires to be compiled along xttensor - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -# C++11 -if(CMAKE_VERSION VERSION_LESS "3.1") - set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}") -else() - set(CMAKE_CXX_STANDARD 11) -endif() - -set(src - APMeter.cc - AUCMeter.cc - ClassErrorMeter.cc - MAPMeter.cc - MSEMeter.cc -) - -add_library(xtmeter SHARED ${src}) -target_link_libraries(xtmeter 
ATen_cpu_library) -target_include_directories(xtmeter PRIVATE ${ATen_CPU_INCLUDE}) - -add_executable(test-meter test/basic.cc ${BACKWARD_ENABLE}) -# add_backward(test-meter) -target_link_libraries(test-meter xtmeter) -target_include_directories(test-meter PRIVATE ${ATen_CPU_INCLUDE}) diff --git a/aten/contrib/meter/ClassErrorMeter.cc b/aten/contrib/meter/ClassErrorMeter.cc deleted file mode 100644 index a24e850542aed8..00000000000000 --- a/aten/contrib/meter/ClassErrorMeter.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "ClassErrorMeter.h" -#include "ATen/ATen.h" -#include - -using namespace at; - -ClassErrorMeter::ClassErrorMeter() { - ClassErrorMeter(1); -} - -ClassErrorMeter::ClassErrorMeter(const int64_t topk) { - topkval_ = CPU(kShort).tensor(); - sumval_ = CPU(kShort).tensor(); - topkval_.resize_({topk}); - sumval_.resize_({topk}); - reset(); -} - -void ClassErrorMeter::reset() { - range_out(topkval_, 1, numel(topkval_)); - sumval_.fill_(0.); - n_ = 0; -} - -void ClassErrorMeter::add(Tensor& output, Tensor& target) { - - // assertions and allocations: - assert(output.dim() == 2 && target.dim() == 1); - //assert(isSameSizeAs(output, target)); - auto sumval_d = sumval_.data(); - auto target_long = target.contiguous().toType(CPU(kLong)); - auto target_d = target_long.data(); - // update counts: - Tensor val, idx; - std::tie(val, idx) = topk(output, numel(topkval_), 1, true, true); - for(long n = 0; n < output.size(0); ++n) { - bool targetseen = false; - Tensor idx_n = idx.select(0,n); - auto idx_n_d = idx_n.data(); - for(long k = 0; k < numel(topkval_); ++k) { - n_++; - if(targetseen) { - sumval_d[k]++; - } else if(idx_n_d[k] == target_d[n]) { - targetseen = true; - sumval_d[k]++; - } - } - } -} - -void ClassErrorMeter::value(Tensor& val) { - val.resize_({numel(topkval_)}); - auto val_d = val.data(); - auto sumval_d = sumval_.data(); - for(long k = 0; k < numel(topkval_); ++k) { - val_d[k] = 1.0 - (double(sumval_d[k]) / double(n_)); - } -} diff --git a/aten/contrib/meter/ClassErrorMeter.h b/aten/contrib/meter/ClassErrorMeter.h deleted file mode 100644 index 87529b17dc716d..00000000000000 --- a/aten/contrib/meter/ClassErrorMeter.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef AT_CLASS_ERROR_METER_H -#define AT_CLASS_ERROR_METER_H - -#include "Meter.h" -#include "ATen/ATen.h" - -class ClassErrorMeter : public Meter -{ -public: - ClassErrorMeter(); - ClassErrorMeter(const int64_t topk); - virtual void reset(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); -private: - Tensor topkval_; - Tensor sumval_; - uint64_t n_; -}; - -#endif diff --git a/aten/contrib/meter/MAPMeter.cc b/aten/contrib/meter/MAPMeter.cc deleted file mode 100644 index a73c541d5ed34f..00000000000000 --- a/aten/contrib/meter/MAPMeter.cc +++ /dev/null @@ -1,23 +0,0 @@ -#include "MAPMeter.h" - -using namespace at; - -MAPMeter::MAPMeter() { - reset(); -} - -void MAPMeter::reset() { - meter_.reset(); -} - -void MAPMeter::add(Tensor& output, Tensor& target) { - meter_.add(output, target); -} - -void MAPMeter::value(Tensor& val) { - //TODO: 0-dim - val.resize_({1}); - Tensor allvalues = val.type().tensor(); - meter_.value(allvalues); - val.fill_(mean(allvalues)); -} diff --git a/aten/contrib/meter/MAPMeter.h b/aten/contrib/meter/MAPMeter.h deleted file mode 100644 index ee38a548661fbd..00000000000000 --- a/aten/contrib/meter/MAPMeter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_MAP_METER_H -#define AT_MAP_METER_H - -#include "Meter.h" -#include "APMeter.h" -#include "ATen/ATen.h" - -class MAPMeter 
: public Meter -{ -public: - MAPMeter(); - virtual void reset(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); -private: - APMeter meter_; -}; - -#endif diff --git a/aten/contrib/meter/MSEMeter.cc b/aten/contrib/meter/MSEMeter.cc deleted file mode 100644 index 431415b08ac6a8..00000000000000 --- a/aten/contrib/meter/MSEMeter.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "MSEMeter.h" -#include -#include - -using namespace at; - -MSEMeter::MSEMeter() { - reset(); -} - -void MSEMeter::reset() { - n_ = 0; - val_ = .0; -} - -void MSEMeter::add(Tensor& output, Tensor& target) { - //assert(isSameSizeAs(output, output - Tensor t = output.sub(target); - Tensor result = t.mul(t).contiguous().toType(CPU(kDouble)); - double * data = result.data(); - for(long n = 0; n < numel(result); ++n) { - n_++; - val_ += ( (1. / ((double)n_ - 1.) * val_) + - ((1. / (double)n_) * data[n])); - } -} - -void MSEMeter::value(Tensor& val) { - //TODO: 0-dim - val.resize_({1}).fill_(val_); -} diff --git a/aten/contrib/meter/MSEMeter.h b/aten/contrib/meter/MSEMeter.h deleted file mode 100644 index 46ccf167d4cf13..00000000000000 --- a/aten/contrib/meter/MSEMeter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_MSE_METER_H -#define AT_MSE_METER_H - -#include "Meter.h" -#include "ATen/ATen.h" - -class MSEMeter : public Meter -{ -public: - MSEMeter(); - virtual void reset(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); -private: - double val_; - uint64_t n_; -}; - -#endif diff --git a/aten/contrib/meter/Meter.h b/aten/contrib/meter/Meter.h deleted file mode 100644 index 7f240ea30286fe..00000000000000 --- a/aten/contrib/meter/Meter.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef AT_METER_H -#define AT_METER_H - -#include "ATen/ATen.h" - -using namespace at; - -class Meter -{ -public: - virtual void add(Tensor& output, Tensor& target) = 0; - virtual void value(Tensor& val) = 0; - virtual void reset() = 0; - virtual ~Meter() {}; -}; - -#endif diff --git a/aten/contrib/meter/test/basic.cc b/aten/contrib/meter/test/basic.cc deleted file mode 100644 index 0c87e5d2330eea..00000000000000 --- a/aten/contrib/meter/test/basic.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "APMeter.h" -#include - -using namespace at; - -int main() -{ - auto && T = CPU(kFloat); - std::cout << "hello\n"; - APMeter meter; - Tensor output = at::randn(T, {10, 7}); - Tensor target = at::zeros(T, {10, 7}); - for(uint64_t n = 0; n < 10; ++n) { - Tensor row = target.select(0,n); - auto row_d = row.data(); - row_d[rand() % 7] = 1.; - } - std::cout << output; - std::cout << target; - meter.add(output, target); - Tensor val; - meter.value(val); - std::cout << "value: " << val << std::endl; - return 0; -} diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index e41c2d95655d96..84568880fcf8e6 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -15,3 +15,4 @@ #include "ATen/TensorOperators.h" #include "ATen/TensorMethods.h" #include "ATen/Dispatch.h" +#include "ATen/DimVector.h" diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index df7e81cf6cc80c..6eac12a0e54262 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -49,11 +49,13 @@ FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") FILE(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh") FILE(GLOB cudnn_cpp "cudnn/*.cpp") +FILE(GLOB miopen_cpp "miopen/*.cpp") FILE(GLOB mkl_cpp "mkl/*.cpp") FILE(GLOB mkldnn_cpp "mkldnn/*.cpp") FILE(GLOB native_cpp 
"native/*.cpp") FILE(GLOB native_cudnn_cpp "native/cudnn/*.cpp") +FILE(GLOB native_miopen_cpp "native/miopen/*.cpp") FILE(GLOB native_cuda_cu "native/cuda/*.cu") FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") @@ -70,10 +72,20 @@ endif() IF(NOT NO_CUDA OR USE_ROCM) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda) set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu}) - set(all_cuda_cpp ${native_cudnn_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS}) + set(all_cuda_cpp ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS}) + + IF(USE_ROCM) + set(all_cuda_cpp ${native_miopen_cpp} ${all_cuda_cpp}) + ELSE() + set(all_cuda_cpp ${native_cudnn_cpp} ${all_cuda_cpp}) + ENDIF() + IF(CUDNN_FOUND) SET(all_cuda_cpp ${all_cuda_cpp} ${cudnn_cpp}) + ELSEIF(MIOPEN_FOUND) + SET(all_cuda_cpp ${all_cuda_cpp} ${miopen_cpp}) ENDIF() + endif() filter_list(generated_h generated_cpp "\\.h$") @@ -203,7 +215,7 @@ if(NOT MSVC) set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) endif() -IF(NOT NO_CUDA) +IF(NOT NO_CUDA AND NOT USE_ROCM) IF ($ENV{ATEN_STATIC_CUDA}) # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support # we first have to build a fake lib that links with no device callbacks, @@ -271,9 +283,15 @@ ENDIF() IF(USE_ROCM) ### Link in the ROCm libraries BLAS / RNG. FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib) - FIND_LIBRARY(HIPRNG_LIBRARY hiprng HINTS ${HIPRNG_PATH}/lib) + FIND_LIBRARY(HIPRAND_LIBRARY hiprand HINTS ${HIPRAND_PATH}/lib) + + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRAND_LIBRARY}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY}) + # Set necessary HIPCC Flags + SET(HIP_HCC_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}") + + # Set necessary CXX Flags + SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${CMAKE_CXX_FLAGS}") ENDIF() # Include CPU paths for CUDA as well @@ -371,6 +389,9 @@ if(NOT AT_LINK_STYLE STREQUAL "INTERFACE") if(NOT NO_CUDA OR USE_ROCM) set_property(TARGET ATen_cuda PROPERTY CXX_STANDARD 11) endif() + if(USE_ROCM) + set_target_properties(ATen_cuda PROPERTIES LINKER_LANGUAGE HIP) + endif() endif() endif() diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 526efb717e4c84..d46d1dc388bd5d 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -354,28 +354,26 @@ inline void CPU_tensor_parallel_apply1( int64_t grain_size = internal::TBB_GRAIN_SIZE) { if (!_apply_preamble({tensor1})) return; - if (tensor1.numel() < grain_size) { - CPU_tensor_apply1(tensor1, op); - return; - } - auto range = tbb::blocked_range(0, tensor1.numel()); if (tensor1.ndimension() < 8) { - tbb::parallel_for( - range, [&tensor1, &op](const tbb::blocked_range r) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &op](int64_t begin, int64_t end) { apply_op( - r.end() - r.begin(), - r.begin(), + end - begin, + begin, op, strided_tensor_iter_fixed(tensor1, true)); }); } else { - tbb::parallel_for( - range, [&tensor1, &op](const tbb::blocked_range r) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &op](int64_t begin, int64_t end) { apply_op( - r.end() - r.begin(), - r.begin(), - op, - 
strided_tensor_iter(tensor1)); + end - begin, begin, op, strided_tensor_iter(tensor1)); }); } } @@ -388,27 +386,28 @@ inline void CPU_tensor_parallel_apply2( int64_t grain_size = internal::TBB_GRAIN_SIZE) { if (!_apply_preamble({tensor1, tensor2})) return; - if ((tensor1.numel() + tensor2.numel()) < grain_size) { - CPU_tensor_apply2(tensor1, tensor2, op); - return; - } - auto range = tbb::blocked_range(0, tensor1.numel()); if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { - tbb::parallel_for( - range, [&tensor1, &tensor2, &op](const tbb::blocked_range r) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { apply_op( - r.end() - r.begin(), - r.begin(), + end - begin, + begin, op, strided_tensor_iter_fixed(tensor1), strided_tensor_iter_fixed(tensor2)); }); } else { - tbb::parallel_for( - range, [&tensor1, &tensor2, &op](const tbb::blocked_range r) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { apply_op( - r.end() - r.begin(), - r.begin(), + end - begin, + begin, op, strided_tensor_iter(tensor1), strided_tensor_iter(tensor2)); diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 6fcb951d184f5c..b8aff319e6271e 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -86,6 +86,9 @@ class AT_API Context { cudaStream_t getCurrentCUDAStream() const { return detail::getCUDAHooks().getCurrentCUDAStream(thc_state.get()); } + cudaStream_t getCurrentCUDAStreamOnDevice(int64_t device) const { + return detail::getCUDAHooks().getCurrentCUDAStreamOnDevice(thc_state.get(), device); + } cudaDeviceProp* getCurrentDeviceProperties() const { return detail::getCUDAHooks().getCurrentDeviceProperties(thc_state.get()); } diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index c2103d20fd4ba5..6d7c7eab0391c1 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -3885,7 +3885,7 @@ aten_custom_call: | if (self_->isScalar()) { // Empty tensor - return self_->type().toScalarType(kLong).tensor({0}); + return toBackend(toDense(backend())).toScalarType(kLong).tensor({0}); } ]] diff --git a/aten/src/ATen/DimVector.h b/aten/src/ATen/DimVector.h new file mode 100644 index 00000000000000..aaa4dc9c07290e --- /dev/null +++ b/aten/src/ATen/DimVector.h @@ -0,0 +1,11 @@ +#pragma once + +#include "SmallVector.h" +#include + +namespace at { + +/// A container for sizes or strides +using DimVector = SmallVector; + +} diff --git a/aten/src/ATen/Parallel.cpp b/aten/src/ATen/Parallel.cpp index c404056465cd01..961cc6df31b2a3 100644 --- a/aten/src/ATen/Parallel.cpp +++ b/aten/src/ATen/Parallel.cpp @@ -7,6 +7,7 @@ #include #include + namespace at { namespace internal { // thread_local variable with internal linkage @@ -51,4 +52,5 @@ void init_tbb_num_threads() { num_threads_ = num_threads; } } -}} // namespace at::internal +} // namespace internal +} // namespace at diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 660ae48c791ccf..dba4e35ca74488 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -1,13 +1,8 @@ #pragma once #include -#include #include +#include -#ifdef __PPC64__ -using default_partitioner_type = tbb::simple_partitioner; -#else -using default_partitioner_type = tbb::affinity_partitioner; -#endif namespace at { namespace internal { @@ -30,68 +25,28 @@ AT_API void init_tbb_num_threads(); constexpr int64_t TBB_GRAIN_SIZE = 32768; } // namespace internal 
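The rewritten CPU_tensor_parallel_apply helpers above call the new parallel_for directly instead of constructing tbb::blocked_range objects themselves; the helper (defined in Parallel.h just below) runs the body serially when the range is shorter than grain_size and otherwise dispatches to TBB. A minimal usage sketch, assuming the (begin, end, grain_size, functor) signature introduced in this patch and that it lives in namespace at:

    #include <ATen/Parallel.h>
    #include <cstdint>
    #include <vector>

    // Square a buffer in parallel chunks. Each lambda invocation receives a
    // contiguous [begin, end) sub-range and processes it serially.
    void square_all(std::vector<float>& data) {
      at::parallel_for(
          /*begin=*/0,
          /*end=*/static_cast<int64_t>(data.size()),
          /*grain_size=*/2048,
          [&](int64_t begin, int64_t end) {
            for (int64_t i = begin; i < end; ++i) {
              data[i] *= data[i];
            }
          });
    }

Because the grain-size check now lives inside parallel_for, small inputs pay no TBB scheduling overhead, which is why the explicit numel-vs-grain_size early returns in CPUApplyUtils.h could be dropped.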
-template class OP> -T parallel_reduce( - T (*f)(const T*, size_t, size_t, T), - const T* data, - size_t start, - size_t end, - T init_) { +template +void parallel_for( + int64_t begin, + int64_t end, + int64_t grain_size, + F f) { internal::init_tbb_num_threads(); - T result_; - static default_partitioner_type ap; - - if (end - start < internal::TBB_GRAIN_SIZE) { - result_ = f(data, start, end, init_); - } else { - result_ = tbb::parallel_reduce( - tbb::blocked_range(start, end, internal::TBB_GRAIN_SIZE), - init_, - [&data, &f](const tbb::blocked_range r, T init) -> T { - return f(data, r.begin(), r.end(), init); - }, - OP(), - ap); - } - return result_; -} - -template -void parallel_reduce_2d( - void (*f)(const T*, T*, size_t, size_t), - size_t num_rows, - size_t num_cols, - size_t numel, - const T* arr_, - T* outarr_) { - internal::init_tbb_num_threads(); +#ifdef __PPC64__ + using default_partitioner_type = tbb::simple_partitioner; +#else + using default_partitioner_type = tbb::affinity_partitioner; +#endif - static default_partitioner_type ap; + thread_local static default_partitioner_type ap; - size_t max_i_ = - (numel && num_rows && num_cols) ? numel / (num_rows * num_cols) : 0; - if (numel < internal::TBB_GRAIN_SIZE) { - for (size_t i_ = 0; i_ < max_i_; i_++) { - int64_t i = i_ * num_rows * num_cols; - int64_t i_r = i_ * num_cols; - const T* arr = arr_ + i; - T* outarr = outarr_ + i_r; - f(arr, outarr, num_rows, num_cols); - } + if ((end - begin) < grain_size) { + f(begin, end); } else { tbb::parallel_for( - tbb::blocked_range(0, max_i_, 1), - [&arr_, &outarr_, num_rows, num_cols, &f]( - const tbb::blocked_range r) { - for (size_t i_ = r.begin(); i_ < r.end(); i_++) { - int64_t i = i_ * num_rows * num_cols; - int64_t i_r = i_ * num_cols; - const T* arr = arr_ + i; - T* outarr = outarr_ + i_r; - f(arr, outarr, num_rows, num_cols); - } - }, + tbb::blocked_range(begin, end, grain_size), + [f](const tbb::blocked_range& r) { f(r.begin(), r.end()); }, ap); } } diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h index 840d78b48acfb0..3439cd12b9dade 100644 --- a/aten/src/ATen/ScalarType.h +++ b/aten/src/ATen/ScalarType.h @@ -89,6 +89,18 @@ static inline const char * toString(ScalarType t) { #undef DEFINE_CASE } +static inline size_t elementSize(ScalarType t) { +#define CASE_ELEMENTSIZE_CASE(ctype,name,_2) \ + case ScalarType:: name : return sizeof(ctype); + + switch(t) { + AT_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE) + default: + AT_ERROR("Unknown ScalarType"); + } +#undef CASE_ELEMENTSIZE_CASE +} + static inline bool isIntegralType(ScalarType t) { return (t == ScalarType::Byte || t == ScalarType::Char || diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h index 521e46082bce4a..3a5926a06df8d1 100644 --- a/aten/src/ATen/SmallVector.h +++ b/aten/src/ATen/SmallVector.h @@ -921,6 +921,12 @@ class SmallVector : public SmallVectorImpl { SmallVectorImpl::operator=(::std::move(RHS)); } + template + const SmallVector &operator=(const Container &RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { if (!RHS.empty()) SmallVectorImpl::operator=(::std::move(RHS)); diff --git a/aten/src/ATen/SparseTensorRef.h b/aten/src/ATen/SparseTensorRef.h index f0381d51c28c7e..9c9fada2dc7117 100644 --- a/aten/src/ATen/SparseTensorRef.h +++ b/aten/src/ATen/SparseTensorRef.h @@ -3,8 +3,8 @@ namespace at { struct Tensor; -struct SparseTensor { - explicit SparseTensor(const Tensor& t): tref(t) {} +struct 
SparseTensorRef { + explicit SparseTensorRef(const Tensor& t): tref(t) {} const Tensor& tref; }; diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 617a57a392c314..e13c594db35d10 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -22,7 +22,7 @@ namespace { template struct Vec256 { static constexpr int size = 32 / sizeof(T); - __at_align32__ T values[32 / sizeof(T)]; + T values[32 / sizeof(T)]; Vec256() {} Vec256(T val) { for (int i = 0; i != size; i++) { diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 242796eebbef86..0e453fdcdeb1b7 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -236,13 +236,10 @@ __host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { return (a + b - 1) / b; } -inline bool getApplyGrid(uint64_t totalElements, dim3& grid) { - int curDevice = -1; - cudaGetDevice(&curDevice); +inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice) { if (curDevice == -1) return false; - uint64_t numBlocks = ATenCeilDiv(totalElements, static_cast(AT_APPLY_THREADS_PER_BLOCK)); - uint64_t maxGridX = at::globalContext().getCurrentDeviceProperties()->maxGridSize[0]; + uint64_t maxGridX = at::globalContext().getDeviceProperties(curDevice)->maxGridSize[0]; if (numBlocks > maxGridX) numBlocks = maxGridX; grid = dim3(numBlocks); @@ -286,7 +283,9 @@ bool CUDA_tensor_apply2(at::Tensor a, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(totalElements, grid)) { + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -323,7 +322,7 @@ bool CUDA_tensor_apply2(at::Tensor a, scalar1, \ scalar2, \ TYPE, A, B> \ - <<>>( \ + <<>>( \ aInfo, bInfo, (TYPE) totalElements, op); #define HANDLE_B_CASE(TYPE, A, B) { \ @@ -466,7 +465,9 @@ bool CUDA_tensor_apply3(at::Tensor a, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(totalElements, grid)) { + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -501,7 +502,7 @@ bool CUDA_tensor_apply3(at::Tensor a, scalar2, \ scalar3, \ TYPE, A, B, C> \ - <<>>( \ + <<>>( \ aInfo, bInfo, cInfo, (TYPE) totalElements, op); #define HANDLE_C_CASE(TYPE, A, B, C) { \ @@ -685,7 +686,9 @@ bool CUDA_tensor_apply4(at::Tensor a, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(totalElements, grid)) { + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -727,7 +730,7 @@ bool CUDA_tensor_apply4(at::Tensor a, scalar3, \ scalar4, \ TYPE, A, B, C, D> \ - <<>>( \ + <<>>( \ aInfo, bInfo, cInfo, dInfo, (TYPE) totalElements, op); #define HANDLE_D_CASE(TYPE, A, B, C, D) { \ diff --git a/aten/src/ATen/cuda/CUDAConfig.h.in b/aten/src/ATen/cuda/CUDAConfig.h.in index 72adee50cf84fb..9e4b3d35c09bec 100644 --- a/aten/src/ATen/cuda/CUDAConfig.h.in +++ b/aten/src/ATen/cuda/CUDAConfig.h.in @@ -5,3 +5,4 @@ // c.f. 
https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined #define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@ +#define AT_MIOPEN_ENABLED() @AT_MIOPEN_ENABLED@ diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index fb2d25f54f86c8..05d408786d47b4 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -48,6 +48,9 @@ bool CUDAHooks::hasCuDNN() const { cudaStream_t CUDAHooks::getCurrentCUDAStream(THCState* thc_state) const { return THCState_getCurrentStream(thc_state); } +cudaStream_t CUDAHooks::getCurrentCUDAStreamOnDevice(THCState* thc_state, int64_t device) const { + return THCState_getCurrentStreamOnDevice(thc_state, device); +} struct cudaDeviceProp* CUDAHooks::getCurrentDeviceProperties(THCState* thc_state) const { return THCState_getCurrentDeviceProperties(thc_state); } @@ -76,6 +79,10 @@ bool CUDAHooks::compiledWithCuDNN() const { return AT_CUDNN_ENABLED(); } +bool CUDAHooks::compiledWithMIOpen() const { + return AT_MIOPEN_ENABLED(); +} + bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() cudaDeviceProp* prop = getCurrentDeviceProperties(globalContext().getTHCState()); diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 69b0fd658eebe0..52dd836ace3885 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -15,12 +15,14 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCUDA() const override; bool hasCuDNN() const override; cudaStream_t getCurrentCUDAStream(THCState*) const override; + cudaStream_t getCurrentCUDAStreamOnDevice(THCState*, int64_t device) const override; struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const override; struct cudaDeviceProp* getDeviceProperties(THCState*, int device) const override; int64_t current_device() const override; std::unique_ptr newPinnedMemoryAllocator() const override; void registerCUDATypes(Context*) const override; bool compiledWithCuDNN() const override; + bool compiledWithMIOpen() const override; bool supportsDilatedConvolutionWithCuDNN() const override; long versionCuDNN() const override; double batchnormMinEpsilonCuDNN() const override; diff --git a/aten/src/ATen/cudnn/Handles.h b/aten/src/ATen/cudnn/Handles.h index 2845dec500f9a1..369b1f3410e03d 100644 --- a/aten/src/ATen/cudnn/Handles.h +++ b/aten/src/ATen/cudnn/Handles.h @@ -1,9 +1,10 @@ #pragma once #include "cudnn-wrapper.h" +#include "ATen/cuda/ATenCUDAGeneral.h" namespace at { namespace native { -cudnnHandle_t getCudnnHandle(); +AT_CUDA_API cudnnHandle_t getCudnnHandle(); }} // namespace diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 2ec5ad67cbf621..6d87dc4478bfdc 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -62,6 +62,10 @@ struct AT_API CUDAHooksInterface { AT_ERROR("cannot getCurrentCUDAStream() without ATen_cuda library"); } + virtual cudaStream_t getCurrentCUDAStreamOnDevice(THCState*, int64_t device) const { + AT_ERROR("cannot getCurrentCUDAStream() without ATen_cuda library"); + } + virtual struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const { AT_ERROR("cannot getCurrentDeviceProperties() without ATen_cuda library"); } @@ -86,6 +90,10 @@ struct AT_API CUDAHooksInterface { return false; } + virtual bool compiledWithMIOpen() const { + return false; + } + virtual bool 
supportsDilatedConvolutionWithCuDNN() const { return false; } diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index f151ecc962b08c..1437101ff2fb12 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -169,7 +169,7 @@ def __init__(self, reason): TYPE_FORMAL_GENERIC = { 'THTensor*': 'Tensor &', - 'THSTensor*': 'SparseTensor', + 'THSTensor*': 'SparseTensorRef', 'THBoolTensor*': 'Tensor &', 'THIndexTensor*': 'Tensor &', 'THIntegerTensor*': 'Tensor &', @@ -186,7 +186,7 @@ def __init__(self, reason): DYNAMIC_TYPE = { 'THTensor*': 'Tensor', - 'THSTensor*': 'SparseTensor', + 'THSTensor*': 'SparseTensorRef', 'THBoolTensor*': 'BoolTensor', 'THIndexTensor*': 'IndexTensor', 'THIntegerTensor*': 'IntegerTensor', @@ -1133,7 +1133,7 @@ def handle_sparse(env, option): return [] check_name = option['when_sparse_dispatch'] sparse_actuals = [arg['name'] - if arg['name'] != check_name else "SparseTensor({})".format(arg['name']) + if arg['name'] != check_name else "SparseTensorRef({})".format(arg['name']) for arg in option['formals_list']] return [SPARSE_CHECK.substitute(env, check_name=check_name, sparse_actuals=sparse_actuals)] diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp new file mode 100644 index 00000000000000..f3d720c6bcdd19 --- /dev/null +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -0,0 +1,120 @@ +#include "Descriptors.h" +#include + +namespace at { namespace native { + +namespace { + +inline miopenDataType_t getDataType(const at::Type& t) { + auto scalar_type = t.scalarType(); + if (scalar_type == at::kFloat) { + return miopenFloat; + } else if (scalar_type == at::kHalf) { + return miopenHalf; + } + throw std::runtime_error("TensorDescriptor only supports float and half tensors"); +} + +inline miopenDataType_t getDataType(const at::Tensor& t) { + return getDataType(t.type()); +} + +} // anonymous namespace + + +void TensorDescriptor::set(const at::Tensor &t, size_t pad) { + set(getDataType(t), t.sizes(), t.strides(), pad); +} + +static int MIOPEN_DIM_MAX = 4; + +void TensorDescriptor::set(miopenDataType_t datatype, IntList t_sizes, IntList t_strides, size_t pad) { + size_t dim = t_sizes.size(); + if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) +#define _STR(X) #X +#define STR(X) _STR(X) + throw std::runtime_error("MIOpen supports only up to " STR(MIOPEN_DIM_MAX) " dimensions"); +#undef _STR +#undef STR + int size[MIOPEN_DIM_MAX]; + int stride[MIOPEN_DIM_MAX]; + for (size_t i = 0; i < dim; ++i) { + size[i] = static_cast(t_sizes[i]); + stride[i] = static_cast(t_strides[i]); + } + for (size_t i = dim; i < pad; ++i) { + size[i] = 1; + stride[i] = 1; + } + set(datatype, static_cast(std::max(dim, pad)), size, stride); +} + +std::string miopenTypeToString(miopenDataType_t dtype) { + switch (dtype) { + case miopenFloat: + return "miopenFloat"; + case miopenHalf: + return "miopenHalf"; + default: + std::ostringstream oss; + oss << "(unknown data-type " << static_cast(dtype) << ")"; + return oss.str(); + } +} + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { + out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + int nbDims = 4; + int dimA[MIOPEN_DIM_MAX]; + int strideA[MIOPEN_DIM_MAX]; + miopenDataType_t dtype; + miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); + out << " type = " << miopenTypeToString(dtype) << "\n"; + out << " nbDims = " << nbDims << "\n"; + // Read out only nbDims of the arrays! 
+ out << " dimA = "; + for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + out << " strideA = "; + for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + return out; +} + +void TensorDescriptor::print() { std::cout << *this; } + +void FilterDescriptor::set(const at::Tensor &t, int64_t pad) { + auto dim = t.ndimension(); + if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) +#define _STR(X) #X +#define STR(X) _STR(X) + throw std::runtime_error("MIOpen supports only up to " STR(MIOPEN_DIM_MAX) " dimensions"); +#undef _STR +#undef STR + if (!t.is_contiguous()) { + // NB: It is possible for this test to be insufficient, because the + // Tensor passed in to set the filter descriptor may not be the actual + // Tensor whose data pointer is passed to cuDNN. Nevertheless, + // that is the common case, so we can catch most client errors with this test. + throw std::runtime_error("MIOpen filters (a.k.a. weights) must be contiguous"); + } + int size[MIOPEN_DIM_MAX]; + int stride[MIOPEN_DIM_MAX]; + for (int i = 0; i < dim; ++i) { + size[i] = (int) t.size(i); + } + for (int i = dim; i < pad; ++i) { + size[i] = (int) 1; + } + for (int i = dim - 1; i >=0; --i) { + stride[i] = (i == dim - 1) ? 1 : stride[i+1] * size[i+1]; + } + dim = std::max(dim, pad); + set(getDataType(t), (int) dim, size, stride); +} + +}} diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h new file mode 100644 index 00000000000000..eb259ea6199f4a --- /dev/null +++ b/aten/src/ATen/miopen/Descriptors.h @@ -0,0 +1,323 @@ +#pragma once + +#include "Exceptions.h" + +#include "miopen-wrapper.h" +#include +#include +#include + +/* +Note [cuDNN dropout descriptor initialization] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In most cases, setting descriptors in cuDNN is cheap (e.g., +cudnnSetTensorNdDescriptor). However, this is not the case for +cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an +expensive precomputation to initialize the random number generator states. In +cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor, +which means that law-abiding clients were expected to generate a dropout +descriptor once and cache it. However, our ATen interface is (1) stateless (so +we can't cache the descriptors) and (2) does not accept arbitrary user types in +its interface (so we can't pass the descriptor in). This puts us in a pickle. + +In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which +forgoes the expensive initialization process, and can initialize the +descriptor with a pre-initialized state CUDA tensor. This is great, because +it means we can simply pass in the state tensor and then initialize the +descriptor internally. Unfortunately, this function is not available in +cuDNN 6. + +To work around this, we break the cuDNN abstraction barrier, and have +the struct layout of the underlaying dropout descriptor. With this struct, +we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great! 
+*/ + +namespace at { namespace native { + +// TODO: Add constructors for all of the descriptors + +inline int dataSize(miopenDataType_t dataType) +{ + switch (dataType) { + case miopenHalf: return 2; + case miopenFloat: return 4; + default: return 8; + } +} + +// The stride for a size-1 dimensions is not uniquely determined; in +// fact, it can be anything you want, because the fact that the +// tensor is size 1 at this dimension means that you will never actually +// try advancing your pointer by this stride. +// +// However, CuDNN has a much more stringent requirement on strides: +// if you are passing a contiguous input, it better be the case +// that the stride for dim i is the product of the sizes of dims +// i+1 to the end. This stride is indeed uniquely determined. This +// function modifies 'stride' in place so this invariant holds. +static inline void fixSizeOneDimStride(int dim, const int *size, int *stride) { + int64_t z = 1; + for(int d = dim-1; d >= 0; d--) + { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + +template +struct DescriptorDeleter { + void operator()(T* x) { + if (x != nullptr) { + MIOPEN_CHECK(dtor(x)); + } + } +}; + +// A generic class for wrapping cuDNN descriptor types. All you need +// is to give the underlying type the Descriptor_t points to (usually, +// if it's miopenTensorDescriptor_t it points to miopenTensorStruct), +// the constructor and the destructor. Subclasses are responsible +// for defining a set() function to actually set the descriptor. +// +// Descriptors default construct to a nullptr, and have a descriptor +// initialized the first time you call set() or any other initializing +// function. +template +class Descriptor +{ +public: + // TODO: Figure out why const-correctness doesn't work here + + // Use desc() to access the underlying descriptor pointer in + // a read-only fashion. Most client code should use this. + // If the descriptor was never initialized, this will return + // nullptr. + T* desc() const { return desc_.get(); } + T* desc() { return desc_.get(); } + + // Use mut_desc() to access the underlying desciptor pointer + // if you intend to modify what it points to (e.g., using + // miopenSetFooDescriptor). This will ensure that the descriptor + // is initialized. Code in this file will use this function. + T* mut_desc() { init(); return desc_.get(); } +protected: + void init() { + if (desc_ == nullptr) { + T* raw_desc; + MIOPEN_CHECK(ctor(&raw_desc)); + desc_.reset(raw_desc); + } + } +private: + std::unique_ptr> desc_; +}; + +class TensorDescriptor + : public Descriptor +{ +public: + TensorDescriptor() {} + explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) { + set(t, pad); + } + + // Note [CuDNN broadcast padding] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // pad specifies the minimum dimensionality of the tensor descriptor + // we produce (it doesn't have anything to do with, e.g., convolution + // padding). If 't' is lower-dimensional than 'pad', the remaining + // dimensions (on the right) are padded with ones. This doesn't + // affect the underlying data layout. This is particularly useful for + // dealing with a pecularity of the CuDNN API, which is that broadcasting in CuDNN is + // done in two steps: first, the client code is expected to pad out + // (the dimensions) input tensors to be the same dimension as the + // target broadcast, and then second, CuDNN takes of actually + // broadcasting size 1 dimensions. 
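// Editorial illustration, not part of the original patch: with the padding
// rule described above, set(t, /*pad=*/4) on a 2-d tensor of sizes {N, C}
// yields a 4-d descriptor with sizes {N, C, 1, 1}; fixSizeOneDimStride() then
// rewrites the strides of the size-1 dimensions so the descriptor still looks
// contiguous. This is what lets the MIOpen batch-norm code later in this diff
// pair a low-dimensional scale/bias tensor with a 4-d input descriptor.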
+ + void set(const at::Tensor &t, size_t pad = 0); + void set(miopenDataType_t dataType, IntList sizes, IntList strides, size_t pad = 0); + + void print(); + +private: + void set(miopenDataType_t dataType, int dim, int* size, int* stride) { + fixSizeOneDimStride(dim, size, stride); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + } +}; + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); + +class FilterDescriptor + : public Descriptor +{ +public: + void set(const at::Tensor &t, int64_t pad = 0); + +private: + void set(miopenDataType_t dataType, int dim, int* size, int* stride) { + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + } +}; + +struct ConvolutionDescriptor + : public Descriptor +{ + void set(miopenDataType_t dataType, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups) { + miopenDataType_t mathType = dataType; + if (dataType == miopenHalf) mathType = miopenFloat; + //?????????????? What the fuck? It's asking for the stride for the height & stride for the width seperately....! + MIOPEN_CHECK(miopenInitConvolutionDescriptor(mut_desc(), miopenConvolution, *pad, *pad, *stride, *stride, 1, 1)); +#if 0 + CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups)); + CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH)); + if(dataType == CUDNN_DATA_HALF) + CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); +#endif + } +}; + +struct SpatialTransformerDescriptor + : public Descriptor +{ + void set(miopenDataType_t dataType, int dim, int* size) { + //?????????????? This function doesn't even exist! + MIOPEN_CHECK(miopenSetSpatialTransformerNdDescriptor(mut_desc(), MIOPEN_SAMPLER_BILINEAR, dataType, dim, size)); + } +}; + +#if 0 // DROPOUT NOT IMPLEMENTED IN MIOpen + +// See Note [cuDNN dropout descriptor initialization] +inline cudnnStatus_t cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed) { + // Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends. + // This is not entirely accurate but is good enough to catch some API + // uses which would not be compatible in cuDNN 7. Feel free to fix + // this if you notice something is wrong. + if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE; + if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE; + size_t expectedStateSizeInBytes; + // State size will differ depending on size of GPU + auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes); + if (ret != CUDNN_STATUS_SUCCESS) return ret; + if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE; + dropoutDesc->dropout = dropout; + dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t); + dropoutDesc->states = states; + return CUDNN_STATUS_SUCCESS; +} + +#endif // DROPOUT NOT IMPLEMENTED IN MIOpen + +#if 0 // DROPOUT NOT IMPLEMENTED IN MIOpen +struct DropoutDescriptor + : public Descriptor +{ + at::Tensor state; + + // Initialize a dropout descriptor's RNG state. + // WARNING: This function is very expensive, avoid calling this function! 
+ // NB: it takes a Type so that we can generate a Variable if necessary + void initialize_rng(const at::Type& ty, miopenHandle_t handle, float dropout, long long int seed) { + AT_ASSERT(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout"); + size_t state_size; + MIOPEN_CHECK(miopenDropoutGetStatesSize(handle, &state_size)); + AT_ASSERT(ty.is_cuda(), "dropout state type must be CUDA type"); + AT_ASSERT(ty.scalarType() == kByte, "dropout state type must be byte"); + state = ty.tensor({static_cast(state_size)}); + MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, dropout, state.data_ptr(), state_size, seed)); + } + + // Restore a dropout descriptor given a dropout probability and existing RNG state. + // See Note [cuDNN dropout descriptor initialization] + void set(miopenHandle_t handle, float dropout, at::Tensor state_) { + AT_ASSERT(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout"); + state = state_; + void *state_ptr = state.data_ptr(); + size_t state_size = state.size(0); + // NB: The seed doesn't actually matter, so we give a dummy value + MIOPEN_CHECK(miopenRestoreDropoutDescriptor(mut_desc(), handle, dropout, state_ptr, state_size, 0 /* seed */)); + } + + // Restore a dropout descriptor corresponding to no dropout + // See Note [cuDNN dropout descriptor initialization] + void set_no_dropout(miopenHandle_t handle) { + // NB: seed doesn't matter when dropout = 0, because no random number + // initialization actually takes place when there is no dropout. + // NB: Empirically, miopenSetDropoutDescriptor is cheap when + // dropoot == 0 + MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */)); + } +}; +#endif // DROPOUT NOT IMPLEMENTED IN MIOpen + +struct RNNDescriptor + : public Descriptor +{ + DropoutDescriptor dropout_desc_; + void set(miopenHandle_t handle, int hidden_size, int num_layers, DropoutDescriptor&& dropout_desc, + miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t bidirectional, + miopenRNNMode_t mode, miopenDataType_t datatype) { + dropout_desc_ = std::move(dropout_desc); + MIOPEN_CHECK(miopenSetRNNDescriptor( + handle, + mut_desc(), + hidden_size, + num_layers, + dropout_desc_.desc(), + input_mode, + bidirectional, + mode, + MIOPEN_RNN_ALGO_STANDARD, + datatype)); +#if 0 + hipDeviceProp_t* prop = globalContext().getCurrentDeviceProperties(); + if (prop->major >= 7) { + if (datatype == CUDNN_DATA_HALF) { + cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); + } else { + // Technically, as the default it's not necessary to explicitly + // set this. 
+ cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_DEFAULT_MATH); + } + } +#endif + } +}; + +union Constant +{ + float f; + double d; + Constant(miopenDataType_t dataType, double value) { + if (dataType == miopenHalf || dataType == miopenFloat) { + f = (float) value; + } else { + d = value; + } + } +}; + +}} // namespace diff --git a/aten/src/ATen/miopen/Exceptions.h b/aten/src/ATen/miopen/Exceptions.h new file mode 100644 index 00000000000000..06180eb7a7a7e4 --- /dev/null +++ b/aten/src/ATen/miopen/Exceptions.h @@ -0,0 +1,74 @@ +#pragma once + +#include "miopen-wrapper.h" +#include +#include +#include + +struct THCState; + +namespace at { namespace native { + +class miopen_exception : public std::runtime_error { +public: + miopenStatus_t status; + miopen_exception(miopenStatus_t status, const char* msg) + : std::runtime_error(msg) + , status(status) {} + miopen_exception(miopenStatus_t status, const std::string& msg) + : std::runtime_error(msg) + , status(status) {} +}; + +const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "miopenStatusSuccess"; + + case miopenStatusNotInitialized: + return "miopenStatusNotInitialized"; + + case miopenStatusInvalidValue: + return "miopenStatusInvalidValue"; + + case miopenStatusBadParm: + return "miopenStatusBadParm"; + + case miopenStatusAllocFailed: + return "miopenStatusAllocFailed"; + + case miopenStatusInternalError: + return "miopenStatusInternalError"; + + case miopenStatusNotImplemented: + return "miopenStatusNotImplemented"; + + case miopenStatusUnknownError: + return "miopenStatusUnknownError"; + + default: + return "Unrecognized Status Code"; + } +} + +inline void MIOPEN_CHECK(miopenStatus_t status) +{ + if (status != miopenStatusSuccess) { + if (status == miopenStatusNotImplemented) { + throw miopen_exception(status, std::string(miopenGetErrorString(status)) + + ". 
This error may appear if you passed in a non-contiguous input."); + } + throw miopen_exception(status, miopenGetErrorString(status)); + } +} + +inline void CUDA_CHECK(hipError_t error) +{ + if (error != hipSuccess) { + std::string msg("HIP error: "); + msg += hipGetErrorString(error); + throw std::runtime_error(msg); + } +} + +}} // namespace at::miopen diff --git a/aten/src/ATen/miopen/Handles.cpp b/aten/src/ATen/miopen/Handles.cpp new file mode 100644 index 00000000000000..d37b1043fdeca5 --- /dev/null +++ b/aten/src/ATen/miopen/Handles.cpp @@ -0,0 +1,42 @@ +#include "Handles.h" + +#include "Exceptions.h" + +#include +#include + +// TODO: Get rid of the mutex, and just initialize these +// handles in at::Context along with lazy CUDA initialization + +namespace at { namespace native { + +namespace { + +struct Handle { + miopenHandle_t handle; + Handle() : handle(NULL) { + MIOPEN_CHECK(miopenCreate(&handle)); + } + ~Handle() { + if (handle) { + miopenDestroy(handle); + } + } +}; + +std::mutex mutex; +std::unordered_map handles; + +} // namespace + + +miopenHandle_t getMiopenHandle() +{ + int device; + CUDA_CHECK(hipGetDevice(&device)); + + std::lock_guard guard(mutex); + return handles[device].handle; +} + +}} // namespace at::miopen diff --git a/aten/src/ATen/miopen/Handles.h b/aten/src/ATen/miopen/Handles.h new file mode 100644 index 00000000000000..e8df69270f17f3 --- /dev/null +++ b/aten/src/ATen/miopen/Handles.h @@ -0,0 +1,9 @@ +#pragma once + +#include "miopen-wrapper.h" + +namespace at { namespace native { + +miopenHandle_t getMiopenHandle(); + +}} // namespace diff --git a/aten/src/ATen/miopen/README.md b/aten/src/ATen/miopen/README.md new file mode 100644 index 00000000000000..f1e9f16e99f3c7 --- /dev/null +++ b/aten/src/ATen/miopen/README.md @@ -0,0 +1,4 @@ +All files living in this directory are written with the assumption that MIOpen is available, +which means that this code is not guarded by `#if AT_MIOPEN_ENABLED()`. Therefore, whenever +you need to use definitions from here, please guard the `#include` and +definition usages with the `#if AT_MIOPEN_ENABLED()` macro, e.g. [native/miopen/BatchNorm.cpp](native/miopen/BatchNorm.cpp).
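To make the README's guarding convention concrete, here is a minimal sketch (editorial; the consumer function maybe_use_miopen() is hypothetical, and the include path of the generated config header is an assumption):

#include "ATen/cuda/CUDAConfig.h"   // provides AT_MIOPEN_ENABLED()

#if AT_MIOPEN_ENABLED()
#include "ATen/miopen/Handles.h"
#endif

void maybe_use_miopen() {
#if AT_MIOPEN_ENABLED()
  // MIOpen symbols may only be referenced inside the guard.
  miopenHandle_t handle = at::native::getMiopenHandle();
  (void)handle;
#else
  // MIOpen headers were never included; fall back or raise here.
#endif
}

native/miopen/BatchNorm.cpp below applies the same idea with its #if !AT_MIOPEN_ENABLED / #else split.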
diff --git a/aten/src/ATen/miopen/Types.cpp b/aten/src/ATen/miopen/Types.cpp new file mode 100644 index 00000000000000..d35927f58acf40 --- /dev/null +++ b/aten/src/ATen/miopen/Types.cpp @@ -0,0 +1,24 @@ +#include "Types.h" + +#include +#include "miopen/version.h" + +namespace at { namespace native { + +miopenDataType_t getMiopenDataType(const at::Tensor& tensor) { + if (tensor.type().scalarType() == at::kFloat) { + return miopenFloat; + } else if (tensor.type().scalarType() == at::kHalf) { + return miopenHalf; + } + std::string msg("getMiopenDataType() not supported for "); + msg += at::toString(tensor.type().scalarType()); + throw std::runtime_error(msg); +} + +int64_t miopen_version() { + //return MIOPEN_VERSION_TWEAK; + return (MIOPEN_VERSION_MAJOR<<8) + (MIOPEN_VERSION_MINOR<<4) + MIOPEN_VERSION_PATCH; +} + +}} // namespace at::miopen diff --git a/aten/src/ATen/miopen/Types.h b/aten/src/ATen/miopen/Types.h new file mode 100644 index 00000000000000..0034aa4d84a2b1 --- /dev/null +++ b/aten/src/ATen/miopen/Types.h @@ -0,0 +1,12 @@ +#pragma once + +#include "miopen-wrapper.h" +#include + +namespace at { namespace native { + +miopenDataType_t getMiopenDataType(const at::Tensor& tensor); + +int64_t miopen_version(); + +}} // namespace at::miopen diff --git a/aten/src/ATen/miopen/Utils.h b/aten/src/ATen/miopen/Utils.h new file mode 100644 index 00000000000000..3a1128b482f4a6 --- /dev/null +++ b/aten/src/ATen/miopen/Utils.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "THC/THC.h" +#include "miopen-wrapper.h" +#include "Handles.h" + +namespace at { namespace native { + +inline void setMIOpenStreamToCurrent() { + // TODO: Should getCurrentStream be a method on Context? + MIOPEN_CHECK(miopenSetStream(getMiopenHandle(), THCState_getCurrentStream(globalContext().getTHCState()))); +} + +// cuDNN has a buggy check for tensor being contiguous (that is, it does +// not ignore stride for dimension that is equal to 0). This function +// makes tensors which have zero stride contiguous, by setting the +// strides to 1 as cuDNN likes. 
+inline Tensor contiguousIfZeroInStrides(const Tensor& t) { + for (auto s : t.strides()) { + if (s == 0) return t.contiguous(); + } + return t; +} + +}} diff --git a/aten/src/ATen/miopen/miopen-wrapper.h b/aten/src/ATen/miopen/miopen-wrapper.h new file mode 100644 index 00000000000000..1e378aedbef17a --- /dev/null +++ b/aten/src/ATen/miopen/miopen-wrapper.h @@ -0,0 +1,3 @@ +#pragma once + +#include diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 88d1acb2dddd02..a293dbbf7145f7 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -102,6 +102,11 @@ auto ConvParams::view1d_as_2d() -> void { } auto ConvParams::use_cudnn(const at::Tensor& input) const -> bool { + if (detail::getCUDAHooks().compiledWithMIOpen()) { + if (!input.type().is_cuda() || !cudnn_enabled) + return false; + return true; + } if (!detail::getCUDAHooks().compiledWithCuDNN()) { return false; } @@ -151,9 +156,9 @@ static void check_input_shape_forward(const at::Tensor& input, if (weight_dim != k) { std::stringstream ss; - ss << "Expected " << k << "-dimensional weight for " << k - << "-dimensional input " << input.sizes() << ", but got weight of size " - << weight.sizes() << " instead"; + ss << "Expected " << weight_dim << "-dimensional input for " << weight_dim + << "-dimensional weight " << weight.sizes() << ", but got input of size " + << input.sizes() << " instead"; throw std::runtime_error(ss.str()); } if (weight.size(0) < groups) { @@ -167,8 +172,8 @@ static void check_input_shape_forward(const at::Tensor& input, if (!transposed) { if (input.size(1) != (weight.size(1) * groups)) { std::stringstream ss; - ss << "Given groups=" << groups << ", weight" << weight.sizes() - << ", so expected input" << input.sizes() << " to have " + ss << "Given groups=" << groups << ", weight of size " << weight.sizes() + << ", expected input" << input.sizes() << " to have " << (weight.size(1) * groups) << " channels, but got " << input.size(1) << " channels instead"; throw std::runtime_error(ss.str()); @@ -183,8 +188,8 @@ static void check_input_shape_forward(const at::Tensor& input, } else { // transposed if (input.size(1) != weight.size(0)) { std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight" << weight.sizes() - << ", so expected input" << input.sizes() << " to have " + ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() + << ", expected input" << input.sizes() << " to have " << weight.size(0) << " channels, but got " << input.size(1) << " channels instead"; throw std::runtime_error(ss.str()); @@ -295,11 +300,11 @@ at::Tensor _convolution( auto input = input_r.contiguous(); auto weight = weight_r; auto bias = bias_r; - auto k = input.ndimension(); + auto k = weight.ndimension(); int64_t dim = k - 2; if (dim <= 0) { - throw std::runtime_error("input has less dimensions than expected"); + throw std::runtime_error("weight should have at least two dimensions"); } ConvParams params; diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 65c9b1005fb391..0446b8c8a392c8 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -147,7 +147,7 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) auto min_idx = index.min().toCLong(); if (max_idx >= dim_size) { AT_ERROR("index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } + } if (min_idx < -dim_size) { AT_ERROR("index ", min_idx, 
" is out of bounds for dimension ", dim, " with size ", dim_size); } @@ -249,6 +249,18 @@ Tensor index(const Tensor & self, TensorList indices) { return src.take(linearIndex); } +Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) { + if (indices.size() > (size_t)self.dim()) { + AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + + Tensor src, linearIndex, expandedValue; + std::tie(src, linearIndex) = makeLinearIndex(self, indices); + std::tie(expandedValue) = expand_inplace(linearIndex, value); + Tensor dst = src.clone(); + return dst.put_(linearIndex, expandedValue); +} + Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { if (indices.size() > (size_t)self.dim()) { AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index c0758c867decb0..02653629e55bfe 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -1,4 +1,3 @@ -#include #include "ATen/ATen.h" #include "ATen/AccumulateType.h" #include "ATen/NativeFunctions.h" @@ -11,11 +10,8 @@ namespace at { namespace native { namespace { -static default_partitioner_type ap; - template void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { - internal::init_tbb_num_threads(); int64_t outer_size = 1; int64_t dim_size = input.size(dim); int64_t inner_size = 1; @@ -28,10 +24,10 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { scalar_t* input_data_base = input.data(); scalar_t* output_data_base = output.data(); int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1); - tbb::parallel_for( - tbb::blocked_range(0, outer_size * inner_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t i = r.begin(); i < r.end(); i++) { + parallel_for( + 0, outer_size * inner_size, grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { int64_t outer_idx = i / inner_size; int64_t inner_idx = i % inner_size; scalar_t* input_data = @@ -62,8 +58,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { else output_data[d * dim_stride] *= tmpsum; } - }, - ap); + }); } template @@ -72,7 +67,6 @@ void host_softmax_backward( const Tensor& grad, const Tensor& output, int64_t dim) { - internal::init_tbb_num_threads(); int64_t outer_size = 1; int64_t dim_size = grad.size(dim); @@ -87,10 +81,9 @@ void host_softmax_backward( scalar_t* output_data_base = output.data(); scalar_t* gradOutput_data_base = grad.data(); int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1); - tbb::parallel_for( - tbb::blocked_range(0, outer_size * inner_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t i = r.begin(); i < r.end(); i++) { + parallel_for( + 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { int64_t outer_idx = i / inner_size; int64_t inner_idx = i % inner_size; scalar_t* gradInput_data = @@ -118,8 +111,7 @@ void host_softmax_backward( } } } - }, - ap); + }); } } // namespace diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index f9f558a3deed0b..e6fe1ffa2f1253 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -263,18 +263,15 @@ THGenerator* get_generator(at::Generator* gen) { 
Tensor randperm(const Type& dtype, int64_t n, Generator* generator) { Tensor result = dtype.tensor(n); - return at::native::randperm_out(result, n, generator); + return at::randperm_out(result, n, generator); } -Tensor& randperm_out(Tensor& result, int64_t n, Generator* generator) { +Tensor& randperm_out_cpu(Tensor& result, int64_t n, Generator* generator) { if (n < 0) { std::ostringstream oss; oss << "n must be non-negative, got " << n; throw std::runtime_error(oss.str()); } - if (result.type().backend() != at::kCPU) { - throw std::runtime_error("randperm is only implemented for CPU"); - } result.resize_({n}); auto gen = get_generator(generator); diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 2f1bf85956f85e..ca66f5b547f4a5 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -6,8 +6,14 @@ #include "ATen/Dispatch.h" #include "ATen/Parallel.h" -#include "ATen/optional.h" #include "ATen/cpu/vec256/vec256.h" +#include "ATen/optional.h" + +#ifdef __PPC64__ +using default_partitioner_type = tbb::simple_partitioner; +#else +using default_partitioner_type = tbb::affinity_partitioner; +#endif namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 357ec1624b5f95..2e0a68b1fed487 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -22,8 +22,6 @@ namespace at { namespace native { namespace { -static default_partitioner_type ap; - template inline void _vec_log_softmax_lastdim( scalar_t* input_data_base, @@ -36,15 +34,17 @@ inline void _vec_log_softmax_lastdim( if (grain_size < CHUNK_SIZE) grain_size = CHUNK_SIZE; - tbb::parallel_for( - tbb::blocked_range(0, outer_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t ii = r.begin(); ii < r.end(); ii += CHUNK_SIZE) { + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t ii = begin; ii < end; ii += CHUNK_SIZE) { scalar_t tmp_sum_scalar[CHUNK_SIZE]; scalar_t max_input_arr[CHUNK_SIZE]; int64_t loop_end = CHUNK_SIZE; - if (ii + CHUNK_SIZE > r.end()) - loop_end = r.end() - ii; + if (ii + CHUNK_SIZE > end) + loop_end = end - ii; for (int64_t j = 0; j < loop_end; j++) { int64_t i = ii + j; scalar_t* input_data = input_data_base + i * dim_size; @@ -83,8 +83,7 @@ inline void _vec_log_softmax_lastdim( dim_size); } } - }, - ap); + }); } template @@ -98,10 +97,12 @@ inline void _vec_softmax_lastdim( if (grain_size < 1) grain_size = 1; - tbb::parallel_for( - tbb::blocked_range(0, outer_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t i = r.begin(); i < r.end(); i++) { + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; scalar_t max_input = vec256::reduce_all( @@ -122,8 +123,7 @@ inline void _vec_softmax_lastdim( output_data, dim_size); } - }, - ap); + }); } template @@ -138,10 +138,12 @@ inline void _vec_host_softmax_backward_lastdim( if (grain_size < 1) grain_size = 1; - tbb::parallel_for( - tbb::blocked_range(0, outer_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t i = r.begin(); i < r.end(); i++) { + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = 
begin; i < end; i++) { scalar_t* grad_input_data = grad_input_data_base + i * dim_size; scalar_t* grad_data = grad_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; @@ -173,14 +175,12 @@ inline void _vec_host_softmax_backward_lastdim( dim_size); } } - }, - ap); + }); } template struct vec_host_softmax_lastdim { static void apply(Tensor& output, const Tensor& input) { - internal::init_tbb_num_threads(); int64_t outer_size = 1; int64_t dim_size = input.size(input.ndimension() - 1); for (int64_t i = 0; i < input.ndimension() - 1; ++i) @@ -201,7 +201,6 @@ template struct vec_host_softmax_backward_lastdim { static void apply(Tensor& grad_input, const Tensor& grad, const Tensor& output) { - internal::init_tbb_num_threads(); int64_t outer_size = 1; int64_t dim_size = grad.size(grad.ndimension() - 1); for (int64_t i = 0; i < grad.ndimension() - 1; ++i) diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index efe7dcf87c852b..ab24d7faba0ba6 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -1,7 +1,6 @@ #include "ATen/native/cpu/UnaryOpsKernel.h" #include -#include #include "ATen/Dispatch.h" #include "ATen/Parallel.h" #include "ATen/cpu/vec256/vec256.h" @@ -13,51 +12,51 @@ namespace { using namespace vec256; -template -static void parallel_apply(Tensor& result, const Tensor& self, F f) { - internal::init_tbb_num_threads(); - -static default_partitioner_type ap; - - auto arr_out = result.data(); - auto arr_in = self.data(); - int64_t size = self.numel(); - int64_t grain_size = 2048; - if (size < grain_size) { - map(f, arr_out, arr_in, size); - } else { - tbb::parallel_for( - tbb::blocked_range(0, size, grain_size), - [&](const tbb::blocked_range& r) { - map(f, arr_out + r.begin(), arr_in + r.begin(), r.end() - r.begin()); - }, - ap); - } -} - static void abs_kernel(Tensor& result, const Tensor& self) { AT_DISPATCH_ALL_TYPES(self.type(), "abs", [&] { - parallel_apply( - result, - self, - [](const Vec256& x) { return x.abs(); }); }); + scalar_t* out = result.data(); + scalar_t* in = self.data(); + int64_t size = self.numel(); + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { + map([](const Vec256& x) { return x.abs(); }, + out + begin, + in + begin, + end - begin); + }); + }); } static void rsqrt_kernel(Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES(self.type(), "rsqrt", [&] { - parallel_apply( - result, - self, - [](const Vec256& x) { return Vec256((scalar_t)(1)) / x.sqrt(); }); }); + scalar_t* out = result.data(); + scalar_t* in = self.data(); + int64_t size = self.numel(); + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { + map( + [](const Vec256& x) { + return Vec256((scalar_t)(1)) / x.sqrt(); + }, + out + begin, + in + begin, + end - begin); + }); + }); } -#define IMPLEMENT_FLOAT_KERNEL(op) \ - static void op##_kernel(Tensor& result, const Tensor& self) { \ - AT_DISPATCH_FLOATING_TYPES(self.type(), #op, [&] { \ - parallel_apply( \ - result, self, [](const Vec256& x) { return x.op(); }); \ - }); \ - } \ +#define IMPLEMENT_FLOAT_KERNEL(op) \ + static void op##_kernel(Tensor& result, const Tensor& self) { \ + AT_DISPATCH_FLOATING_TYPES(self.type(), #op, [&] { \ + scalar_t* out = result.data(); \ + scalar_t* in = self.data(); \ + int64_t size = self.numel(); \ + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { \ + map([](const Vec256& x) { return x.op(); }, \ + out + begin, \ + in + 
begin, \ + end - begin); \ + }); \ + }); \ + } \ REGISTER_DISPATCH(op##Impl, &op##_kernel) } // anonymous namespace diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index f6db9bdc681c2b..4f2013c2f19f3e 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -1,4 +1,14 @@ +#include "ATen/ATen.h" #include "ATen/NativeFunctions.h" +#include "ATen/cuda/CUDATypeConversion.cuh" + +#include +#include +#include +#include +#include +#include + #include #include @@ -26,4 +36,57 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { return result; } +Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { + if (n < 0) { + std::ostringstream oss; + oss << "n must be non-negative, got " << n; + throw std::runtime_error(oss.str()); + } + + if (n > 0) { + AT_DISPATCH_ALL_TYPES_AND_HALF( + result.type(), "randperm_out_cuda", [&] { + AT_CHECK(Scalar(n).to(), + "n is too large for result tensor type: '", result.type().toString(), "'"); + } + ); + } + + result.resize_({n}); + + if (result.type().scalarType() == at::ScalarType::Half) { + auto result_float = CUDA(kFloat).tensor({n}); + result.copy_(randperm_out_cuda(result_float, n, generator)); + } else { + if (n < 30000) { // For small inputs, we offload it to CPU instead. + auto result_cpu = result.type().toBackend(kCPU).tensor({n}); + randperm_out(result_cpu, n, generator); + result.copy_(result_cpu); + } else { + // Generate random values for the keys array + AT_DISPATCH_ALL_TYPES( + result.type(), "randperm_out_cuda", [&] { + using cuda_scalar_t = cuda::into_type; + + auto keys = result.type().tensor(result.sizes()).random_(generator); + + auto result_data = thrust::device_ptr(result.data()); + auto keys_data = thrust::device_ptr(keys.data()); + + auto state = globalContext().getTHCState(); + THCThrustAllocator thrustAlloc(state); + auto policy = thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)); + + thrust::sequence(policy, result_data, result_data + n); + + // Use the sorted order of keys to rearrange the result array + thrust::sort_by_key(policy, keys_data, keys_data + n, result_data); + } + ); + } + } + + return result; +} + }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index bba5b9aa92eb1f..c5475980860d82 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -323,7 +323,7 @@ struct ConvolutionArgs { // Hashing machinery for ConvolutionParams struct ParamsHash { - size_t operator()(const ConvolutionParams& params) const { + std::size_t operator()(const ConvolutionParams& params) const { auto ptr = reinterpret_cast(¶ms); uint32_t value = 0x811C9DC5; for (int i = 0; i < (int)sizeof(ConvolutionParams); ++i) { diff --git a/aten/src/ATen/native/miopen/BatchNorm.cpp b/aten/src/ATen/native/miopen/BatchNorm.cpp new file mode 100644 index 00000000000000..a87acd8fb68d76 --- /dev/null +++ b/aten/src/ATen/native/miopen/BatchNorm.cpp @@ -0,0 +1,213 @@ +#include +#include +#include + +#if !AT_MIOPEN_ENABLED + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +std::tuple cudnn_batch_norm( + const Tensor& input, const Tensor& weight, + const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, + bool training, double exponential_average_factor, double epsilon) { + throw std::runtime_error("cudnn_batch_norm: ATen not compiled with MIOpen support"); +} + 
+std::tuple cudnn_batch_norm_backward( + const Tensor& input, const Tensor& grad_output, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, + const Tensor& save_mean, const Tensor& save_var, + double epsilon) { + throw std::runtime_error("cudnn_batch_norm_backward: ATen not compiled with MIOpen support"); +} + +}} // namespace at::native + +#else // AT_MIOPEN_ENABLED + +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +Tensor expandScale(const Tensor& t, int64_t dim) { + std::vector size{ 1, t.numel() }; + while (static_cast(size.size()) < dim) { + size.emplace_back(1); + } + return t.view(size); +} + +} // namespace + +std::tuple cudnn_batch_norm( + const Tensor& input_t, const Tensor& weight_t, + const Tensor& bias_t, const Tensor& running_mean_t, const Tensor& running_var_t, + bool training, double exponential_average_factor, double epsilon) +{ + TensorArg input{ input_t, "input", 1 }, + weight{ weight_t, "weight", 2 }, + bias{ bias_t, "bias", 3 }, + running_mean{ running_mean_t, "running_mean", 4 }, + running_var{ running_var_t, "running_var", 5 }; + CheckedFrom c = "cudnn_batch_norm"; + setMIOpenStreamToCurrent(); + + checkAllDefined(c, {input, weight, bias}); + if (!training) { + checkAllDefined(c, {running_mean, running_var}); + } + checkAllSameGPU(c, {input, weight, bias, running_mean, running_var}); + if (input->type().scalarType() == ScalarType::Half) { + checkScalarType(c, weight, ScalarType::Float); + } else { + checkAllSameType(c, {input, weight}); + } + checkAllSameType(c, {weight, bias, running_mean, running_var}); + // TODO: is weight required to be contiguous? + checkAllContiguous(c, {input, weight, bias, running_mean, running_var}); + checkDimRange(c, input, 2, 6 /* exclusive */); + auto num_features = input->size(1); + for (auto t : {weight, bias, running_mean, running_var}) { + if (t->defined()) { + checkNumel(c, t, num_features); + } + } + + miopenBatchNormMode_t mode; + if (input->dim() == 2) { + mode = miopenBNPerActivation; + } else { + mode = miopenBNSpatial; + } + + auto output_t = input->type().tensor(input->sizes()); + TensorArg output{ output_t, "output", 0 }; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*input); + TensorDescriptor idesc{ *input, 4 }; // input descriptor + TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, running_mean, etc. 
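// Editorial note, not part of the original patch: expandScale() above views the
// 1-d scale/bias tensor of length C as {1, C, 1, ...} and, combined with the
// pad argument of 4, gives wdesc the same dimensionality as the 4-d input
// descriptor idesc. That 1xCx1x1 layout is the derived batch-norm descriptor
// shape fed to the miopenBatchNormalization* calls below.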
+ + Constant one(dataType, 1); + Constant zero(dataType, 0); + Tensor save_mean, save_var; + + if (training) { + int64_t num_features = input_t.size(1); + save_mean = weight_t.type().tensor({ num_features }); + save_var = weight_t.type().tensor({ num_features }); + MIOPEN_CHECK(miopenBatchNormalizationForwardTraining( + handle, mode, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), output->data_ptr(), + wdesc.desc(), + weight->data_ptr(), + bias->data_ptr(), + exponential_average_factor, + at::maybe_data_ptr(running_mean), + at::maybe_data_ptr(running_var), + epsilon, + save_mean.data_ptr(), + save_var.data_ptr())); + } else { + MIOPEN_CHECK(miopenBatchNormalizationForwardInference( + handle, mode, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), output->data_ptr(), + wdesc.desc(), + weight->data_ptr(), + bias->data_ptr(), + running_mean->data_ptr(), + running_var->data_ptr(), + epsilon)); + } + + // save_mean and save_var can be undefined + // If this causes problems, we can initialize them to empty tensors + // of the correct type + return std::tuple{output_t, save_mean, save_var}; +} + +// NB: CuDNN only implements the backward algorithm for batchnorm +// in training mode (evaluation mode batchnorm has a different algorithm), +// which is why this doesn't accept a 'training' parameter. +std::tuple cudnn_batch_norm_backward( + const Tensor& input_t, const Tensor& grad_output_t, const Tensor& weight_t, + // Unused: but we require them to be passed so that double backwards + // has access + const Tensor& running_mean, const Tensor& running_var, + const Tensor& save_mean_t, const Tensor& save_var_t, + double epsilon) +{ + TensorArg input{ input_t, "input", 1 }, + grad_output{ grad_output_t, "grad_output", 2 }, + weight{ weight_t, "weight", 3 }, + save_mean{ save_mean_t, "save_mean", 4 }, + save_var{ save_var_t, "save_var", 5 }; + CheckedFrom c = "cudnn_batch_norm_backward"; + setMIOpenStreamToCurrent(); + + checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); + checkAllSameGPU(c, {input, grad_output, weight, save_mean, save_var}); + if (input->type().scalarType() == ScalarType::Half) { + checkScalarType(c, weight, ScalarType::Float); + } else { + checkAllSameType(c, {input, weight}); + } + checkAllSameType(c, {input, grad_output}); + checkAllSameType(c, {weight, save_mean, save_var}); + // TODO: is weight required to be contiguous? + checkAllContiguous(c, {input, grad_output, save_mean, save_var}); + checkDimRange(c, input, 2, 6 /* exclusive */); + checkSameSize(c, input, grad_output); + auto num_features = input->size(1); + for (auto t : {weight, save_mean, save_var}) { + checkNumel(c, t, num_features); + } + + miopenBatchNormMode_t mode; + if (input->dim() == 2) { + mode = miopenBNPerActivation; + } else { + mode = miopenBNSpatial; + } + + auto grad_input_t = input->type().tensor(input->sizes()); + auto grad_weight_t = weight->type().tensor(weight->sizes()); + auto grad_bias_t = weight->type().tensor(weight->sizes()); + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*input); + + TensorDescriptor idesc{ *input, 4 }; // input, output, grad_output descriptor + TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, save_mean, etc. 
+ + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenBatchNormalizationBackward( + handle, mode, &one, &zero, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), grad_output->data_ptr(), + idesc.desc(), grad_input_t.data_ptr(), + wdesc.desc(), weight->data_ptr(), + grad_weight_t.data_ptr(), + grad_bias_t.data_ptr(), + epsilon, + save_mean->data_ptr(), + save_var->data_ptr())); + + return std::tuple{grad_input_t, grad_weight_t, grad_bias_t}; +} + +}} // namespace native + +#endif diff --git a/aten/src/ATen/native/miopen/Conv.cpp b/aten/src/ATen/native/miopen/Conv.cpp new file mode 100644 index 00000000000000..fb9bd89a2919cc --- /dev/null +++ b/aten/src/ATen/native/miopen/Conv.cpp @@ -0,0 +1,1209 @@ +#include +#include +#include + +#if !AT_MIOPEN_ENABLED + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +at::Tensor cudnn_convolution( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_backward_input( + IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_backward_input: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_backward_weight( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_backward_weight: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_backward_bias( + const at::Tensor& grad_output) { + throw std::runtime_error("cudnn_convolution_backward_bias: ATen not compiled with MIOpen support"); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + throw std::runtime_error("cudnn_convolution_backward: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_transpose( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + IntList padding, IntList output_padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_transpose_backward_input( + const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_transpose_backward_weight( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); +} + +std::tuple 
cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with MIOpen support"); +} + +}} + +#else // AT_MIOPEN_ENABLED + +#include "THC/THC.h" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +// TODO: Go through all the checking code again and make sure +// we haven't missed anything. + +// --------------------------------------------------------------------- +// +// Math +// +// --------------------------------------------------------------------- + +constexpr int input_batch_size_dim = 0; // also grad_input +constexpr int input_channels_dim = 1; +constexpr int output_batch_size_dim = 0; // also grad_output +constexpr int output_channels_dim = 1; +constexpr int weight_output_channels_dim = 0; +constexpr int weight_input_channels_dim = 1; + +// Often written as 2 + max_dim (extra dims for batch size and channels) +constexpr int max_dim = 3; + +// NB: conv_output_size and conv_input_size are not bijections, +// as conv_output_size loses information; this is why conv_input_size +// takes an extra output_padding argument to resolve the ambiguity. + +std::vector conv_output_size( + IntList input_size, IntList weight_size, + IntList padding, IntList stride, IntList dilation, int64_t groups +) { + // ASSERT(input_size.size() > 2) + // ASSERT(input_size.size() == weight_size.size()) + auto dim = input_size.size(); + std::vector output_size(dim); + output_size[0] = input_size[input_batch_size_dim]; + output_size[1] = weight_size[weight_output_channels_dim]; + for (size_t d = 2; d < dim; ++d) { + auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; + output_size[d] = (input_size[d] + (2 * padding[d - 2]) + - kernel) / stride[d - 2] + 1; + } + return output_size; +} + +std::vector conv_input_size( + IntList output_size, IntList weight_size, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups +) { + // ASSERT(output_size.size() > 2) + // ASSERT(output_size.size() == weight_size.size()) + auto dim = output_size.size(); + std::vector input_size(dim); + input_size[0] = output_size[output_batch_size_dim]; + input_size[1] = weight_size[weight_input_channels_dim] * groups; + for (size_t d = 2; d < dim; ++d) { + int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; + input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) + + kernel + output_padding[d - 2]; + } + return input_size; +} + +std::vector conv_weight_size( + IntList input_size, IntList output_size, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups +) { + auto dim = input_size.size(); + std::vector weight_size(dim); + weight_size[0] = output_size[1]; + weight_size[1] = input_size[1] / groups; + for (size_t d = 2; d < dim; ++d) { + int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + + 2 * padding[d - 2] - output_padding[d - 2]; + weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; + } + return weight_size; +} + +// TODO: Move this into the standard library, with a better name? 
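A quick worked instance of conv_output_size() above (editorial, with made-up sizes): for a spatial dimension with input_size = 32, weight_size = 3, padding = 1, stride = 2 and dilation = 1, the effective kernel is 1 * (3 - 1) + 1 = 3, so output_size = (32 + 2*1 - 3) / 2 + 1 = 16. The integer division is why conv_input_size() cannot invert this exactly and takes the extra output_padding argument, as noted above.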
+Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { + auto group_size = t.size(dim) / groups; + return t.narrow(dim, group_idx * group_size, group_size); +} + +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Note [Legacy CuDNN grouped convolution support] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// CuDNN earlier than CuDNN 7 does not directly support group +// convolution, so we provide support for it by sequentially +// running a convolution per group with appropriately +// adjusted sizes. https://blog.yani.io/filter-group-tutorial/ +// has a fairly good diagram explaining how it works. + +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) +{ + if (args.size() > expected_size){ + std::stringstream ss; + ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } + else if (args.size() < expected_size){ + std::stringstream ss; + ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } +} + + +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) 
+static void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntList padding, IntList stride, IntList dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + +// This POD struct is used to let us easily compute hashes of the +// parameters +struct ConvolutionParams +{ + miopenDataType_t dataType; + int input_size[2 + max_dim]; + int input_stride[2 + max_dim]; + int weight_size[2 + max_dim]; + int padding[max_dim]; + int stride[max_dim]; + int dilation[max_dim]; + int64_t groups; + bool deterministic; + // NB: transposed purposely omitted: transposed just swaps + // forward and backward, so you can reuse the benchmark entry, +}; +// ConvolutionParams must be a POD because we read out its memory +// contenst as char* when hashing +static_assert(std::is_pod::value, "ConvolutionParams not POD"); + +// NB: This can't be a constructor, because then ConvolutionParams +// would not be a POD anymore. +// TODO: Use TensorGeometry here instead of the entire Tensor, which we +// don't actually need. (OTOH: We can always pass in +// grad_input/grad_output, so this is not very pressing) +void setConvolutionParams( + ConvolutionParams* params, + const at::Tensor& input, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool deterministic) { + + miopenDataType_t dataType = getMiopenDataType(input); + memset(params, 0, sizeof(ConvolutionParams)); + params->dataType = dataType; + // ASSERT(weight.dim() == input.dim()) + for (int i = 0; i != input.dim(); ++i) { + params->input_size[i] = (int) input.size(i); + params->input_stride[i] = (int) input.stride(i); + params->weight_size[i] = (int) weight.size(i); + } + // ASSERT(padding.size() == stride.size()) + // ASSERT(padding.size() == dilation.size()) + for (size_t i = 0; i != padding.size(); ++i) { + params->padding[i] = padding[i]; + params->stride[i] = stride[i]; + params->dilation[i] = dilation[i]; + } + // In principle, we shouldn't parametrize by groups for legacy + // CuDNN, but it doesn't seem worth the effort to actually do this. 
+ params->groups = groups; + params->deterministic = deterministic; +} + +// Convenience struct for passing around descriptors and data +// pointers +struct ConvolutionArgs { + miopenHandle_t handle; + ConvolutionParams params; + TensorDescriptor idesc, odesc; + FilterDescriptor wdesc; + const Tensor& input, output, weight; + ConvolutionDescriptor cdesc; + + ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) { + } +}; + +// --------------------------------------------------------------------- +// +// Benchmarking +// +// --------------------------------------------------------------------- + +// Hashing machinery for ConvolutionParams +struct ParamsHash { + std::size_t operator()(const ConvolutionParams& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (int i = 0; i < (int)sizeof(ConvolutionParams); ++i) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +struct ParamsEqual { + bool operator()(const ConvolutionParams& a, const ConvolutionParams& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(ConvolutionParams)) == 0; + } +}; + +// TODO: Use something less heavy duty than a big honking mutex +template +struct BenchmarkCache { + std::mutex mutex; + std::unordered_map map; + + bool find(const ConvolutionParams& params, T* results) { + std::lock_guard guard(mutex); + auto it = map.find(params); + if (it == map.end()) { + return false; + } + *results = it->second; + return true; + } + + void insert(const ConvolutionParams& params, const T& results) { + std::lock_guard guard(mutex); + map[params] = results; + } +}; + +BenchmarkCache fwd_algos; +BenchmarkCache bwd_data_algos; +BenchmarkCache bwd_filter_algos; + +// TODO: Stop manually allocating CUDA memory; allocate an ATen byte +// tensor instead. 
+struct Workspace { + Workspace(size_t size) : size(size), data(NULL) { + CUDA_CHECK(THCudaMalloc(globalContext().lazyInitCUDA(), &data, size)); + } + Workspace(const Workspace&) = delete; + Workspace(Workspace&&) = default; + Workspace& operator=(Workspace&&) = default; + ~Workspace() { + if (data) { + THCudaFree(globalContext().lazyInitCUDA(), data); + } + } + + size_t size; + void* data; +}; + +template +struct algorithm_search { +}; + +miopenStatus_t getWorkspaceSize( + const ConvolutionArgs& args, + miopenConvFwdAlgorithm_t algo, size_t* sz) +{ + return miopenConvolutionForwardGetWorkSpaceSize( + args.handle, + args.wdesc.desc(), + args.idesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + sz); +} +miopenStatus_t getWorkspaceSize( + const ConvolutionArgs& args, miopenConvBwdDataAlgorithm_t algo, size_t* sz) +{ + return miopenConvolutionBackwardDataGetWorkSpaceSize( + args.handle, + args.odesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + sz); +} +miopenStatus_t getWorkspaceSize( + const ConvolutionArgs& args, miopenConvBwdWeightsAlgorithm_t algo, size_t* sz) +{ + return miopenConvolutionBackwardWeightsGetWorkSpaceSize( + args.handle, + args.odesc.desc(), + args.idesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + sz); +} + +template +size_t getMaxWorkspaceSize( + const ConvolutionArgs& args, + const algo_t *algo, int n_algo) +{ + THCState *state = globalContext().lazyInitCUDA(); + + size_t max_ws_size = 0; + size_t max_block_size = 0; + size_t total_gpu_mem = 0; + size_t free_gpu_mem = 0; + + THCudaCheck(THCudaMemGetInfoCached(state, &free_gpu_mem, &total_gpu_mem, &max_block_size)); + + for (int i = 0; i < n_algo; i++) { + miopenStatus_t err; + size_t sz; + err = getWorkspaceSize(args, algo[i], &sz); + if (miopenStatusSuccess != err || sz == 0 + || sz < max_ws_size || sz > max_block_size) continue; + max_ws_size = sz; + } + return max_ws_size; +} + +template +perf_t getBestAlgorithm(perf_t *perfResults, bool deterministic, int n_algo) { + if (deterministic) { + throw std::runtime_error("no deterministic convolution algorithms available in MIOpen"); + } else { + return perfResults[0]; + } +} + +template<> +struct algorithm_search { + using perf_t = miopenConvAlgoPerf_t; + using algo_t = miopenConvFwdAlgorithm_t; + + static constexpr auto DEFAULT_ALGO = miopenConvolutionFwdAlgoGEMM; + static BenchmarkCache& cache() { return fwd_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + miopenConvolutionFwdAlgoGEMM, + miopenConvolutionFwdAlgoFFT, + miopenConvolutionFwdAlgoDirect, + miopenConvolutionFwdAlgoWinograd, + }; + static constexpr int num_algos = 4; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing MIOpen convolution forward algorithms"); + int perf_count; + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + Workspace ws(max_ws_size); + MIOPEN_CHECK(miopenFindConvolutionForwardAlgorithm( + args.handle, + args.idesc.desc(), args.input.data_ptr(), + args.wdesc.desc(), args.weight.data_ptr(), + args.cdesc.desc(), + args.odesc.desc(), args.output.data_ptr(), + num_algos, + &perf_count, + &perf_results, + ws.data, + ws.size, + false)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count) + } + + static void getAlgorithm( + const ConvolutionArgs& args, + algo_t* algo) + { + throw std::runtime_error("miopenGetConvolutionForwardAlgorithm is not supported."); + 
/*miopenConvolutionFwdPreference_t pref = cudnn_convolution_FWD_PREFER_FASTEST; + MIOPEN_CHECK(miopenGetConvolutionForwardAlgorithm( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + pref, + 0, + algo));*/ + } + + static void getWorkspaceSize( + const ConvolutionArgs& args, + algo_t algo, size_t* workspaceSize) + { + throw std::runtime_error("miopenGetConvolutionForwardWorkspaceSize is not supported."); + /*MIOPEN_CHECK(miopenGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + workspaceSize));*/ + } +}; + +template<> +struct algorithm_search { + using perf_t = miopenConvAlgoPerf_t; + using algo_t = miopenConvBwdDataAlgorithm_t; + + static constexpr auto DEFAULT_ALGO = miopenConvolutionBwdDataAlgoGEMM; + static BenchmarkCache& cache() { return bwd_data_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + miopenConvolutionBwdDataAlgoGEMM, + miopenConvolutionBwdDataAlgoDirect, + miopenConvolutionBwdDataAlgoFFT, + miopenConvolutionBwdDataAlgoWinograd, + }; + static constexpr int num_algos = 4; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing MIOpen convolution backward data algorithms."); + int perf_count; + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + Workspace ws(max_ws_size); + MIOPEN_CHECK(miopenFindConvolutionBackwardDataAlgorithm( + args.handle, + args.odesc.desc(), args.output.data_ptr(), + args.wdesc.desc(), args.weight.data_ptr(), + args.cdesc.desc(), + args.idesc.desc(), args.input.data_ptr(), + num_algos, + &perf_count, + &perf_results, + ws.data, + ws.size, + false)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm(const ConvolutionArgs& args, algo_t* algo) { + throw std::runtime_error("miopenGetConvolutionBackwardDataAlgorithm is not supported."); + + /*MIOPEN_CHECK(miopenGetConvolutionBackwardDataAlgorithm( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + cudnn_convolution_BWD_DATA_PREFER_FASTEST, + 0, + algo));*/ + } + + static void getWorkspaceSize( + const ConvolutionArgs& args, + miopenConvBwdDataAlgorithm_t algo, size_t* workspaceSize) + { + throw std::runtime_error("miopenGetConvolutionBackwardDataWorkspaceSize is not supported."); + + /*MIOPEN_CHECK(miopenGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + workspaceSize));*/ + } +}; + +template<> +struct algorithm_search { + using perf_t = miopenConvAlgoPerf_t; + using algo_t = miopenConvBwdWeightsAlgorithm_t; + + static constexpr auto DEFAULT_ALGO = miopenConvolutionBwdWeightsAlgoGEMM; + static BenchmarkCache& cache() { return bwd_filter_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + miopenConvolutionBwdWeightsAlgoGEMM, + miopenConvolutionBwdWeightsAlgoDirect, + }; + + static constexpr int num_algos = 2; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing MIOpen convolution backward filter algorithms."); + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + int perf_count; + Workspace ws(max_ws_size); + + 
MIOPEN_CHECK(miopenFindConvolutionBackwardWeightsAlgorithm( + args.handle, + args.odesc.desc(), args.output.data_ptr(), + args.idesc.desc(), args.input.data_ptr(), + args.cdesc.desc(), + args.wdesc.desc(), args.weight.data_ptr(), + num_algos, + &perf_count, + &perf_results, + ws.data, + ws.size, + false)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm(const ConvolutionArgs& args, algo_t* algo) { + throw std::runtime_error("miopenGetConvolutionBackwardFilterAlgorithm is not supported."); + + /*MIOPEN_CHECK(miopenGetConvolutionBackwardFilterAlgorithm( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + cudnn_convolution_BWD_FILTER_PREFER_FASTEST, + 0, + algo));*/ + } + + static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) + { + throw std::runtime_error("miopenGetConvolutionBackwardFilterWorkspaceSize is not supported."); + + /*MIOPEN_CHECK(miopenGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + workspaceSize));*/ + } +}; + +template +void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) { + using search = algorithm_search; + auto& cache = search::cache(); + + if (cache.find(args.params, algo)) { + return; + } + + if (args.params.deterministic && !benchmark) { + *algo = search::DEFAULT_ALGO; + return; + } + + if (!benchmark) { + search::getAlgorithm(args, algo); + return; + } + + if (cache.find(args.params, algo)) { + // re-check cache since another thread may have benchmarked the algorithm + return; + } + + auto perfResults = search::findAlgorithm(args); + // for deterministic algo, look at all the perf results and return the best + // deterministic algo + if (perfResults.status == miopenStatusSuccess && + !(args.params.deterministic /*&& perfResults.determinism != CUDNN_DETERMINISTIC*/)) { + *algo = perfResults.algo; + } else { + //*algo = search::DEFAULT_ALGO; + throw std::runtime_error("Deterministic algorithms aren't supported in MIOpen."); + } + cache.insert(args.params, *algo); + + THCDeviceAllocator* allocator = THCCachingAllocator_get(); + CUDA_CHECK(allocator->emptyCache(allocator->state)); +} + +template +Workspace chooseAlgorithm( + const ConvolutionArgs& args, + bool benchmark, + algo_t* algo) +{ + findAlgorithm(args, benchmark, algo); + + using search = algorithm_search; + size_t workspace_size; + search::getWorkspaceSize(args, *algo, &workspace_size); + try { + return Workspace(workspace_size); + } catch (std::runtime_error& e) { + hipGetLastError(); // clear OOM error + + // switch to default algorithm and record it in the cache to prevent + // further OOM errors + *algo = search::DEFAULT_ALGO; + search::cache().insert(args.params, *algo); + + search::getWorkspaceSize(args, *algo, &workspace_size); + return Workspace(workspace_size); + } +} + +// --------------------------------------------------------------------- +// +// Bias addition +// +// --------------------------------------------------------------------- + +// In-place! +void cudnn_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const TensorArg& bias) +{ + checkAllSameType(c, {output, bias}); + checkAllSameGPU(c, {output, bias}); + checkSize(c, bias, { output->size(output_channels_dim) }); + + // See Note [CuDNN broadcast padding]. 
Handle the left padding + // ourselves, but use TensorDescriptor's padding argument to do the rest. + TensorDescriptor bdesc, odesc; + bdesc.set(bias->expand({1, bias->size(0)}), output->dim()); + odesc.set(*output); + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*bias); + Constant one(dataType, 1); + + MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->data_ptr(), + &one, odesc.desc(), output->data_ptr())); +} + +// The general strategy: +// +// - cudnn_convolution (Tensor) +// Entry points for clients, takes bias +// +// - cudnn_convolution_forward (TensorArg) +// Entry point, which may be reused between regular +// convolution and transposed convolution. Does NOT take bias. +// +// - raw_cudnn_convolution_forward_out (Tensor) +// Low level function which invokes CuDNN, and takes an output +// tensor which is directly written to (thus _out). +// +// Where does argument checking happen? Here's the division of +// responsibility: +// - Things that happen in at::Tensor +// - TensorArg allocation +// - setCuDNNStreamToCurrent +// - Things that happen in TensorArg +// - Check arguments (type, GPU, shape) +// +// TODO: Consider renaming zero-indexed arguments to "self" + + + +// --------------------------------------------------------------------- +// +// Convolution forward / Transposed convolution backward +// +// --------------------------------------------------------------------- + +// The raw API directly invokes CuDNN and does not emulate support +// for group convolution on old versions of CuDNN. +// +// There are a few reasons this should never be directly exposed +// via ATen: +// +// - It takes output as a parameter (this should be computed!) +// - It doesn't do input checking +// - It doesn't resize output (it is assumed to be correctly sized) +// - It takes a ConvolutionParams struct +// +void raw_cudnn_convolution_forward_out( + const Tensor& output, const Tensor& input, const Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getMiopenDataType(input); + + ConvolutionArgs args{ input, output, weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, input, weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(weight); + args.odesc.set(output); + args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + // TODO: when we do legacy group convolution support, we'll repeatedly + // reinitialize the workspace for each convolution we do. This is + // wasteful; we'd rather reuse the workspace. OTOH, legacy group + // convolution support is already pretty slow, so this might not + // matter. (This applies to raw_cudnn_convolution_backward_input as well.) 
+ miopenConvFwdAlgorithm_t fwdAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionForward( + args.handle, + &one, args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.data_ptr(), + args.cdesc.desc(), fwdAlg, &zero, + args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); +} + +Tensor cudnn_convolution_forward( + CheckedFrom c, + const TensorArg& input, const TensorArg& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + checkAllSameType(c, {input, weight}); + checkAllSameGPU(c, {input, weight}); + + auto output_t = input->type().tensor( + conv_output_size(input->sizes(), weight->sizes(), + padding, stride, dilation, groups)); + + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{ output_t, "result", 0 }; + convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_forward_out( + narrowGroup(*output, output_channels_dim, i, groups), + narrowGroup(*input, input_channels_dim, i, groups), + narrowGroup(weight_contig, weight_output_channels_dim, i, groups), + padding, stride, dilation, 1, benchmark, deterministic); + } +#else + raw_cudnn_convolution_forward_out( + *output, *input, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return *output; +} + +Tensor cudnn_convolution( + const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }, + bias { bias_t, "bias", 3 }; + setMIOpenStreamToCurrent(); + CheckedFrom c = "cudnn_convolution"; + auto output_t = cudnn_convolution_forward( + c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); + if (bias->defined()) { + cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; +} + +// NB: output_padding not needed here, as there is no ambiguity to +// resolve +Tensor cudnn_convolution_transpose_backward_input( + const Tensor& grad_output_t, const Tensor& weight_t, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg grad_output { grad_output_t, "grad_output", 1 }, + weight { weight_t, "weight", 2 }; + setMIOpenStreamToCurrent(); + return cudnn_convolution_forward( + "cudnn_convolution_transpose_backward_input", + grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); +} + +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, 
padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[2]) { + grad_bias = at::cudnn_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +// --------------------------------------------------------------------- +// +// Convolution backward / Transposed convolution forward +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getMiopenDataType(grad_output); + + ConvolutionArgs args{ grad_input, grad_output, weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, grad_input, weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(grad_input); + args.wdesc.set(weight); + args.odesc.set(grad_output); + args.cdesc.set(dataType, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + miopenConvBwdDataAlgorithm_t bwdDataAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardData( + args.handle, + &one, args.odesc.desc(), grad_output.data_ptr(), + args.wdesc.desc(), weight.data_ptr(), + args.cdesc.desc(), bwdDataAlg, &zero, + args.idesc.desc(), grad_input.data_ptr(), workspace.data, workspace.size)); +} + +// Backward and transpose are algorithmically equivalent, but they +// compute their geometry differently. In a backwards, you knew what +// the original size of the input tensor was, so you can cache that +// geometry and fill it directly. In transposed convolution, it is +// more conventional to not explicitly specify the output (previously +// input) size, and compute it. This, however, leaves a degree of +// freedom; this degree of freedom is resolved using the +// output_padding parameter. Both of these interfaces are equivalent, +// but they are differently convenient depending on the use case. 
+ +Tensor cudnn_convolution_backward_input( + CheckedFrom c, + IntList input_size, const TensorArg& grad_output, const TensorArg& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + checkAllSameType(c, {grad_output, weight}); + checkAllSameGPU(c, {grad_output, weight}); + + auto grad_input_t = grad_output->type().tensor(input_size); + + // Avoid "grad_input" when this is being used as transposed convolution + TensorArg grad_input{ grad_input_t, "result", 0 }; + convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_backward_input_out( + narrowGroup(*grad_input, input_channels_dim, i, groups), + narrowGroup(*grad_output, output_channels_dim, i, groups), + narrowGroup(weight_contig, weight_output_channels_dim, i, groups), + padding, stride, dilation, 1, benchmark, deterministic); + } +#else + raw_cudnn_convolution_backward_input_out( + *grad_input, *grad_output, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return *grad_input; +} + +Tensor cudnn_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, const TensorArg& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), + padding, output_padding, stride, dilation, groups); + return cudnn_convolution_backward_input(c, input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor cudnn_convolution_backward_input( + IntList input_size, const Tensor& grad_output_t, const Tensor& weight_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; + setMIOpenStreamToCurrent(); + return cudnn_convolution_backward_input( + "cudnn_convolution_backward_input", + input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[2]) { + grad_bias = at::cudnn_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +Tensor cudnn_convolution_transpose( + const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, + IntList padding, IntList output_padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }, + bias { bias_t, "bias", 3 }; + CheckedFrom 
c = "cudnn_convolution_transpose"; + auto output_t = cudnn_convolution_transpose_forward( + c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); + if (bias->defined()) { + cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; +} + +// --------------------------------------------------------------------- +// +// Convolution backward (weight) +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getMiopenDataType(input); + + ConvolutionArgs args{ input, grad_output, grad_weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, input, grad_weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(grad_weight); + args.odesc.set(grad_output); + args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, args.odesc.desc(), grad_output.data_ptr(), + args.idesc.desc(), input.data_ptr(), + args.cdesc.desc(), bwdFilterAlg, &zero, + args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); +} + +Tensor cudnn_convolution_backward_weight( + CheckedFrom c, + IntList weight_size, const TensorArg& grad_output, const TensorArg& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + + checkAllSameType(c, {grad_output, input}); + checkAllSameGPU(c, {grad_output, input}); + + auto grad_weight_t = grad_output->type().tensor(weight_size); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. 
+ TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_backward_weight_out( + narrowGroup(*grad_weight, weight_output_channels_dim, i, groups), + narrowGroup(*grad_output, output_channels_dim, i, groups), + narrowGroup(*input, input_channels_dim, i, groups), + padding, stride, dilation, groups, benchmark, deterministic); + } +#else + raw_cudnn_convolution_backward_weight_out( + *grad_weight, *grad_output, *input, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return grad_weight_t; +} + +Tensor cudnn_convolution_backward_weight( + IntList weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + setMIOpenStreamToCurrent(); + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, grad_output, input, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor cudnn_convolution_transpose_backward_weight( + IntList weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + setMIOpenStreamToCurrent(); + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, input, grad_output, + padding, stride, dilation, groups, benchmark, deterministic); +} + +// --------------------------------------------------------------------- +// +// Convolution backward (bias) +// +// --------------------------------------------------------------------- + +Tensor cudnn_convolution_backward_bias( + const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + setMIOpenStreamToCurrent(); + + auto grad_bias_t = grad_output->type().tensor( + { grad_output->size(output_channels_dim) }); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + // See Note [CuDNN broadcast padding]. Handle the left padding + // ourselves, but use TensorDescriptor's pad argument to do the rest. 
+ TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +} + + +}} // namespace + +#endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index d6b45b82673dbf..4cd9d347a071b9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -574,6 +574,8 @@ - func: index_copy_(Tensor self, int64_t dim, IndexTensor index, Tensor source) -> Tensor variants: method +- func: index_put(Tensor self, TensorList indices, Tensor values) -> Tensor + - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor - func: isclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> Tensor @@ -798,6 +800,9 @@ - func: randperm_out(Tensor result, int64_t n, *, Generator* generator=nullptr) -> Tensor variants: function + dispatch: + CPU: randperm_out_cpu + CUDA: randperm_out_cuda - func: range(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor variants: function diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index f7a40f58d17ef5..8515c95f0ca510 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -3,7 +3,6 @@ IF (MSVC) return() ENDIF() ENDIF(MSVC) - list(APPEND ATen_CPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/scalar_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/apply_utils_test.cpp diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index e299da20bcdaf9..1ad8491d8c97db 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -8,6 +8,22 @@ #include +typedef struct _THTensor +{ + int64_t *size; + int64_t *stride; + int nDimension; + + // Note: storage->size may be greater than the recorded size + // of a tensor + THStorage *storage; + ptrdiff_t storageOffset; + std::atomic refcount; + + char flag; + +} _THTensor; + #include "generic/THTensor.hpp" #include "THGenerateAllTypes.h" diff --git a/aten/src/TH/generic/THTensor.hpp b/aten/src/TH/generic/THTensor.hpp index c7bd01d803e0cf..1f5d99fa6d17b6 100644 --- a/aten/src/TH/generic/THTensor.hpp +++ b/aten/src/TH/generic/THTensor.hpp @@ -2,20 +2,9 @@ #define TH_GENERIC_FILE "generic/THTensor.hpp" #else -typedef struct THTensor -{ - int64_t *size; - int64_t *stride; - int nDimension; - - // Note: storage->size may be greater than the recorded size - // of a tensor - THStorage *storage; - ptrdiff_t storageOffset; - std::atomic refcount; - - char flag; - +// This allows one to write generic functions (i.e. functions that work across all THRealTensor types) +// without having to explicitly cast the THRealTensor. 
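The THTensor refactor here hoists the shared fields into a plain _THTensor base, so each generated THRealTensor (see the typedef just below) becomes an empty derived struct and type-agnostic helpers no longer need casts. A toy sketch of that pattern with made-up names:

```cpp
// Toy sketch (hypothetical names, not the TH code) of the base-struct pattern:
// shared fields live in one plain struct, each "real" tensor type is an empty
// derived struct, and generic helpers take the base pointer without casts.
#include <cstdint>
#include <cstdio>

struct _ToyTensor {
  int64_t* size;
  int64_t* stride;
  int nDimension;
};

// What the per-type generated headers reduce to under this scheme:
struct ToyFloatTensor  : _ToyTensor {};
struct ToyDoubleTensor : _ToyTensor {};

// One non-templated helper works for every derived tensor type.
int toy_nDimension(const _ToyTensor* t) { return t->nDimension; }

int main() {
  int64_t sz[2] = {3, 4};
  int64_t st[2] = {4, 1};
  ToyFloatTensor t{};
  t.size = sz;
  t.stride = st;
  t.nDimension = 2;
  std::printf("dims = %d\n", toy_nDimension(&t));
}
```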
+typedef struct THTensor : _THTensor { } THTensor; #endif diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 88f4e3c9daaa9b..677972333e59c7 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -2,6 +2,8 @@ #define TH_GENERIC_FILE "generic/THTensorRandom.cpp" #else +#include + #ifdef _OPENMP #include #endif @@ -61,7 +63,7 @@ void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p) #define BERNOULLI_OMP 800 #define TH_OMP_OVERHEAD_THRESHOLD_COPY 20000 -void iBernoulli_generate_copy(THTensor *self, THGenerator *_generator, const double p) +void THTensor_(iBernoulli_generate_copy)(THTensor *self, THGenerator *_generator, const double p) { int64_t seed = THRandom_random(_generator); int64_t n = THTensor_(nElement)(self); @@ -135,7 +137,7 @@ void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p) #ifdef TH_BLAS_MKL if(cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { std::lock_guard lock(_generator->mutex); - iBernoulli_generate_copy(self, _generator, p); + THTensor_(iBernoulli_generate_copy)(self, _generator, p); } else { std::lock_guard lock(_generator->mutex); TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); @@ -403,6 +405,10 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), 2, "invalid multinomial distribution (encountering probability entry < 0)"); + THArgCheckWithCleanup((std::isfinite(val)), + THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); sum += val; THDoubleStorage_set( cum_dist->storage, \ diff --git a/aten/src/THC/THCApply.cuh b/aten/src/THC/THCApply.cuh index 724820d6726005..c202e6d1e0b663 100644 --- a/aten/src/THC/THCApply.cuh +++ b/aten/src/THC/THCApply.cuh @@ -165,13 +165,11 @@ inline dim3 getApplyBlock() { return dim3(THC_APPLY_THREADS_PER_BLOCK); } -inline bool getApplyGrid(THCState* state, uint64_t totalElements, dim3& grid) { - int curDevice = -1; - cudaGetDevice(&curDevice); +inline bool getApplyGrid(THCState* state, uint64_t totalElements, dim3& grid, int curDevice) { if (curDevice == -1) return false; uint64_t numBlocks = THCCeilDiv(totalElements, static_cast(THC_APPLY_THREADS_PER_BLOCK)); - uint64_t maxGridX = THCState_getCurrentDeviceProperties(state)->maxGridSize[0]; + uint64_t maxGridX = THCState_getDeviceProperties(state, curDevice)->maxGridSize[0]; if (numBlocks > maxGridX) numBlocks = maxGridX; @@ -192,11 +190,11 @@ bool THC_pointwiseApply1(THCState* state, const Op& op, TensorArgType aType = ReadWrite) { static_assert(std::is_same::DataType>::value, "ScalarTypeA must match"); - if (TensorUtils::getDims(state, a) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, a) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, a) == 0) { + if (THCTensor_nDimension(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -204,9 +202,11 @@ bool THC_pointwiseApply1(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; - ptrdiff_t totalElements = TensorUtils::getNumElements(state, a); - - if (!getApplyGrid(state, totalElements, grid)) { + ptrdiff_t totalElements = THCTensor_nElement(state, a); + + int curDevice = -1; + 
cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { return false; } @@ -218,7 +218,7 @@ bool THC_pointwiseApply1(THCState* state, TensorTypeA* oldA = NULL; if (aType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, a)) { + THCTensor_maybeOverlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; a = TensorUtils::newContiguous(state, a); @@ -236,8 +236,8 @@ bool THC_pointwiseApply1(THCState* state, kernelPointwiseApply1 \ - <<>>( \ - OffsetInfo \ + <<>>( \ + OffsetInfo \ (aInfo), \ (TYPE) totalElements, op); @@ -259,7 +259,7 @@ bool THC_pointwiseApply1(THCState* state, // and the resulting non-linear offset is all computable using 32-bit math?) // We also use unsigned index math in the kernel, as signed div/mod has // additional overhead. - if (TensorUtils::canUse32BitIndexMath(state, a)) { + if (THCTensor_canUse32BitIndexMath(state, a)) { TensorInfo aInfo = getTensorInfo(state, a); rearrangeDims(&aInfo); @@ -310,7 +310,7 @@ bool THC_pointwiseApply1(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldA contiguous. TensorUtils::copyIgnoringOverlaps(state, oldA, a); - TensorUtils::free(state, a); + THCTensor_free(state, a); a = oldA; } @@ -331,17 +331,17 @@ bool THC_pointwiseApply2(THCState* state, static_assert(std::is_same::DataType>::value, "ScalarTypeA must match"); static_assert(std::is_same::DataType>::value, "ScalarTypeB must match"); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, a); - if (totalElements != TensorUtils::getNumElements(state, b)) { + ptrdiff_t totalElements = THCTensor_nElement(state, a); + if (totalElements != THCTensor_nElement(state, b)) { return false; } - if (TensorUtils::getDims(state, a) > MAX_CUTORCH_DIMS || - TensorUtils::getDims(state, b) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, a) > MAX_CUTORCH_DIMS || + THCTensor_nDimension(state, b) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, a) == 0) { + if (THCTensor_nDimension(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -349,7 +349,9 @@ bool THC_pointwiseApply2(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(state, totalElements, grid)) { + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { return false; } @@ -362,13 +364,13 @@ bool THC_pointwiseApply2(THCState* state, TensorTypeB* oldB = NULL; if (aType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, a)) { + THCTensor_maybeOverlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; a = TensorUtils::newContiguous(state, a); } if (bType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, b)) { + THCTensor_maybeOverlappingIndices(state, b)) { // Must perform in contiguous space oldB = b; b = TensorUtils::newContiguous(state, b); @@ -387,8 +389,8 @@ bool THC_pointwiseApply2(THCState* state, ScalarTypeA, \ ScalarTypeB, \ TYPE, A, B> \ - <<>>( \ - OffsetInfo \ + <<>>( \ + OffsetInfo \ (aInfo), \ OffsetInfo \ (bInfo), \ @@ -422,8 +424,8 @@ bool THC_pointwiseApply2(THCState* state, } \ } - if (TensorUtils::canUse32BitIndexMath(state, a) && - TensorUtils::canUse32BitIndexMath(state, b)) { + if (THCTensor_canUse32BitIndexMath(state, a) && + THCTensor_canUse32BitIndexMath(state, b)) { TensorInfo aInfo = getTensorInfo(state, a); @@ -490,7 +492,7 @@ bool THC_pointwiseApply2(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldA 
contiguous. TensorUtils::copyIgnoringOverlaps(state, oldA, a); - TensorUtils::free(state, a); + THCTensor_free(state, a); a = oldA; } @@ -499,7 +501,7 @@ bool THC_pointwiseApply2(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldB contiguous. TensorUtils::copyIgnoringOverlaps(state, oldB, b); - TensorUtils::free(state, b); + THCTensor_free(state, b); b = oldB; } @@ -525,20 +527,20 @@ bool THC_pointwiseApply3(THCState* state, static_assert(std::is_same::DataType>::value, "ScalarTypeB must match"); static_assert(std::is_same::DataType>::value, "ScalarTypeB must match"); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, a); + ptrdiff_t totalElements = THCTensor_nElement(state, a); - if (totalElements != TensorUtils::getNumElements(state, b) || - totalElements != TensorUtils::getNumElements(state, c)) { + if (totalElements != THCTensor_nElement(state, b) || + totalElements != THCTensor_nElement(state, c)) { return false; } - if (TensorUtils::getDims(state, a) > MAX_CUTORCH_DIMS || - TensorUtils::getDims(state, b) > MAX_CUTORCH_DIMS || - TensorUtils::getDims(state, c) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, a) > MAX_CUTORCH_DIMS || + THCTensor_nDimension(state, b) > MAX_CUTORCH_DIMS || + THCTensor_nDimension(state, c) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, a) == 0) { + if (THCTensor_nDimension(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -546,7 +548,9 @@ bool THC_pointwiseApply3(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(state, totalElements, grid)) { + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { return false; } @@ -560,19 +564,19 @@ bool THC_pointwiseApply3(THCState* state, TensorTypeC* oldC = NULL; if (aType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, a)) { + THCTensor_maybeOverlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; a = TensorUtils::newContiguous(state, a); } if (bType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, b)) { + THCTensor_maybeOverlappingIndices(state, b)) { // Must perform in contiguous space oldB = b; b = TensorUtils::newContiguous(state, b); } if (cType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, c)) { + THCTensor_maybeOverlappingIndices(state, c)) { // Must perform in contiguous space oldC = c; c = TensorUtils::newContiguous(state, c); @@ -584,7 +588,7 @@ bool THC_pointwiseApply3(THCState* state, ScalarTypeB, \ ScalarTypeC, \ TYPE, A, B, C> \ - <<>>( \ + <<>>( \ OffsetInfo \ (aInfo), \ OffsetInfo \ @@ -635,9 +639,9 @@ bool THC_pointwiseApply3(THCState* state, } \ } - if (TensorUtils::canUse32BitIndexMath(state, a) && - TensorUtils::canUse32BitIndexMath(state, b) && - TensorUtils::canUse32BitIndexMath(state, c)) { + if (THCTensor_canUse32BitIndexMath(state, a) && + THCTensor_canUse32BitIndexMath(state, b) && + THCTensor_canUse32BitIndexMath(state, c)) { TensorInfo aInfo = getTensorInfo(state, a); @@ -720,7 +724,7 @@ bool THC_pointwiseApply3(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldA contiguous. TensorUtils::copyIgnoringOverlaps(state, oldA, a); - TensorUtils::free(state, a); + THCTensor_free(state, a); a = oldA; } @@ -729,7 +733,7 @@ bool THC_pointwiseApply3(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldB contiguous. 
TensorUtils::copyIgnoringOverlaps(state, oldB, b); - TensorUtils::free(state, b); + THCTensor_free(state, b); b = oldB; } @@ -738,7 +742,7 @@ bool THC_pointwiseApply3(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldC contiguous. TensorUtils::copyIgnoringOverlaps(state, oldC, c); - TensorUtils::free(state, c); + THCTensor_free(state, c); c = oldC; } diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 9e54c56dc45534..e89bf424e30aea 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -103,7 +103,7 @@ static inline __device__ void atomicAdd(half *address, half val) { do { assumed = old; -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) half hsum; hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); hsum = THCNumerics::add(hsum, val); @@ -135,7 +135,7 @@ static inline __device__ void atomicAdd(double *address, double val) { // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); } -#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) +#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__) // This needs to be defined for the host side pass static inline __device__ void atomicAdd(double *address, double val) { } #endif diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h index 43ffd76e316896..d9b8cba72d2081 100644 --- a/aten/src/THC/THCHalf.h +++ b/aten/src/THC/THCHalf.h @@ -4,22 +4,16 @@ #include "THCGeneral.h" /* We compile with CudaHalfTensor support if we have this: */ -#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 +#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__) #define CUDA_HALF_TENSOR 1 #endif -/* For HIP, rely on the half instructions as well.*/ -#if defined(__HIP_PLATFORM_HCC__) -#define CUDA_HALF_TENSOR 1 -#define CUDA_HALF_INSTRUCTIONS 1 -#endif - #ifdef CUDA_HALF_TENSOR #include #include -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) #ifndef __cplusplus typedef __half_raw half; #endif diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 1e7f15e5d51473..40d6b9274b2bc9 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -48,6 +48,7 @@ struct THCNumerics { static inline __host__ __device__ uint8_t abs(uint8_t a) { return a; } static inline __host__ __device__ uint8_t pow(uint8_t a, uint8_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(uint8_t a) { return false; } + static inline __host__ __device__ bool isinf(uint8_t a) { return false; } }; template <> @@ -70,6 +71,7 @@ struct THCNumerics { static inline __host__ __device__ int8_t abs(int8_t a) { return ::abs((int)a); } static inline __host__ __device__ int8_t pow(int8_t a, int8_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(int8_t a) { return false; } + static inline __host__ __device__ bool isinf(int8_t a) { return false; } }; template <> @@ -92,6 +94,7 @@ struct THCNumerics { static inline __host__ __device__ int16_t abs(int16_t a) { return ::abs((int)a); } static inline __host__ __device__ int16_t pow(int16_t a, int16_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(int16_t a) { return false; } + static inline __host__ __device__ bool isinf(int16_t a) { return false; } }; template <> @@ -114,6 +117,7 @@ struct THCNumerics { static inline __host__ __device__ int32_t abs(int32_t a) { return ::abs(a); } static inline __host__ 
__device__ int32_t pow(int32_t a, int32_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(int32_t a) { return false; } + static inline __host__ __device__ bool isinf(int32_t a) { return false; } }; template <> @@ -142,6 +146,7 @@ struct THCNumerics { static inline __host__ __device__ int64_t abs(int64_t a) { return labs(a); } static inline __host__ __device__ int64_t pow(int64_t a, int64_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(int64_t a) { return false; } + static inline __host__ __device__ bool isinf(int64_t a) { return false; } }; #ifdef CUDA_HALF_TENSOR @@ -624,6 +629,19 @@ static inline __host__ __device__ half lgamma(half a) { return ne(a, a); } + static inline __host__ __device__ bool isinf(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hisinf(a) != 0; +#else + float fa = __half2float(a); + return ::isinf(fa); +#endif +#else // __CUDA_ARCH__ + return ::isinf(THC_half2float(a)); +#endif + } + }; #endif @@ -677,6 +695,7 @@ struct THCNumerics { static inline __host__ __device__ float pow (float a, float b) { return powf(a, b); } static inline __host__ __device__ float atan2(float a, float b) { return atan2f(a, b); } static inline __host__ __device__ bool isnan(float a) { return ::isnan(a); } + static inline __host__ __device__ bool isinf(float a) { return ::isinf(a); } }; template <> @@ -729,6 +748,7 @@ struct THCNumerics { static inline __host__ __device__ double pow (double a, double b) { return ::pow(a, b); } static inline __host__ __device__ double atan2(double a, double b) { return ::atan2(a, b); } static inline __host__ __device__ bool isnan(double a) { return ::isnan(a); } + static inline __host__ __device__ bool isinf(double a) { return ::isinf(a); } }; /// `half` has some type conversion issues associated with it, since it diff --git a/aten/src/THC/THCReduce.cuh b/aten/src/THC/THCReduce.cuh index 01afbd5fc44067..e046a7041e4012 100644 --- a/aten/src/THC/THCReduce.cuh +++ b/aten/src/THC/THCReduce.cuh @@ -283,18 +283,18 @@ bool THC_reduceDim(THCState* state, int dim, int keepdim) { static_assert(std::is_same::DataType>::value, "ScalarType must match"); - ptrdiff_t inElements = TensorUtils::getNumElements(state, in); + ptrdiff_t inElements = THCTensor_nElement(state, in); - int64_t reductionSize = TensorUtils::getSize(state, in, dim); - int64_t reductionStride = TensorUtils::getStride(state, in, dim); + int64_t reductionSize = THCTensor_size(state, in, dim); + int64_t reductionStride = THCTensor_stride(state, in, dim); ptrdiff_t outElements = inElements / reductionSize; - if (TensorUtils::getDims(state, out) > MAX_CUTORCH_DIMS || - TensorUtils::getDims(state, in) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, out) > MAX_CUTORCH_DIMS || + THCTensor_nDimension(state, in) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, in) == 0) { + if (THCTensor_nDimension(state, in) == 0) { // Zero-dim tensor; do nothing return true; } @@ -343,13 +343,13 @@ bool THC_reduceDim(THCState* state, // Resize out to correspond to the reduced size with keepdim=True. 
// Preserve noncontiguities by unsqueezing out if necessary - TensorUtils::preserveReduceDimSemantics( - state, out, TensorUtils::getDims(state, in), dim, keepdim); + THCTensor_preserveReduceDimSemantics( + state, out, THCTensor_nDimension(state, in), dim, keepdim); // Resize out - THLongStorage* sizes = TensorUtils::newSizeOf(state, in); + THLongStorage* sizes = THCTensor_newSizeOf(state, in); THLongStorage_set(sizes, dim, 1); - TensorUtils::resize(state, out, sizes, NULL); + THCTensor_resize(state, out, sizes, NULL); THLongStorage_free(sizes); // It is possible that the tensor dimensions are able to be collapsed, @@ -418,8 +418,8 @@ bool THC_reduceDim(THCState* state, } \ } - if (TensorUtils::canUse32BitIndexMath(state, out) && - TensorUtils::canUse32BitIndexMath(state, in)) { + if (THCTensor_canUse32BitIndexMath(state, out) && + THCTensor_canUse32BitIndexMath(state, in)) { TensorInfo outInfo = getTensorInfo(state, out); @@ -459,7 +459,7 @@ bool THC_reduceDim(THCState* state, if (!keepdim) { - TensorUtils::squeeze1d(state, out, out, dim); + THCTensor_squeeze1d(state, out, out, dim); } return true; } diff --git a/aten/src/THC/THCReduceAll.cuh b/aten/src/THC/THCReduceAll.cuh index 1237d1998534c6..db9431de8de1da 100644 --- a/aten/src/THC/THCReduceAll.cuh +++ b/aten/src/THC/THCReduceAll.cuh @@ -232,13 +232,13 @@ bool THC_reduceAll(THCState* state, AccT* out, int outOnDevice) { static_assert(std::is_same::DataType>::value, "ScalarTypeA must match"); - ptrdiff_t inElements = TensorUtils::getNumElements(state, in); + ptrdiff_t inElements = THCTensor_nElement(state, in); - if (TensorUtils::getDims(state, in) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, in) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, in) == 0) { + if (THCTensor_nDimension(state, in) == 0) { // Zero-dim tensor; do nothing *out = init; return true; @@ -283,7 +283,7 @@ bool THC_reduceAll(THCState* state, } \ } - if (TensorUtils::canUse32BitIndexMath(state, in)) { + if (THCTensor_canUse32BitIndexMath(state, in)) { TensorInfo inInfo = getTensorInfo(state, in); inInfo.collapseDims(); diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index c1e09275ade571..6547beb9d6ede8 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -8,6 +8,63 @@ #include "generic/THCStorage.cpp" #include "THCGenerateAllTypes.h" +THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type) +{ + return THCStorage_newWithSize(state, scalar_type, 0); +} + +THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size) +{ + return THCStorage_newWithAllocator( + state, scalar_type, size, + state->cudaDeviceAllocator, + state->cudaDeviceAllocator->state); +} + +THCStorage* THCStorage_newWithAllocator(THCState *state, + at::ScalarType scalar_type, + ptrdiff_t size, + THCDeviceAllocator* allocator, + void* allocatorContext) +{ + THArgCheck(size >= 0, 2, "invalid size"); + int device; + THCudaCheck(cudaGetDevice(&device)); + + THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); + memset(storage, 0, sizeof(THCStorage)); + new (&storage->refcount) std::atomic(1); + storage->scalar_type = scalar_type; + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; + storage->allocator = allocator; + storage->allocatorContext = allocatorContext; + storage->size = size; + storage->device = device; + + if(size > 0) + { + // update heap *before* attempting malloc, to free space for the malloc + cudaError_t err = + 
(*allocator->malloc)(allocatorContext, + (void**)&(storage->data_ptr), + size * at::elementSize(scalar_type), + THCState_getCurrentStreamOnDevice(state, device)); + if(err != cudaSuccess){ + free(storage); + } + THCudaCheck(err); + } else { + storage->data_ptr = NULL; + } + return storage; +} + +void THCStorage_retain(THCState *state, THCStorage *self) +{ + if(self && (self->flag & TH_STORAGE_REFCOUNTED)) + self->refcount++; +} + void THCStorage_free(THCState *state, THCStorage *self) { if(!(self->flag & TH_STORAGE_REFCOUNTED)) @@ -26,3 +83,75 @@ void THCStorage_free(THCState *state, THCStorage *self) THFree(self); } } + +void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) +{ + THArgCheck(size >= 0, 2, "invalid size"); + THAssert(self->allocator != NULL); + int device; + THCudaCheck(cudaGetDevice(&device)); + + if(!(self->flag & TH_STORAGE_RESIZABLE)) + THError("Trying to resize storage that is not resizable"); + + size_t elementSize = at::elementSize(self->scalar_type); + + if (self->allocator->realloc) { + void * data_ptr = self->data_ptr; + cudaError_t err = (*self->allocator->realloc)( + self->allocatorContext, + (void**)&(data_ptr), + self->size * elementSize, + size * elementSize, THCState_getCurrentStreamOnDevice(state, device)); + if (err != cudaSuccess) { + THCudaCheck(err); + } + self->size = size; + self->device = device; + return; + } + + if(size == 0) + { + if(self->flag & TH_STORAGE_FREEMEM) { + THCudaCheck( + (*self->allocator->free)(self->allocatorContext, self->data_ptr)); + } + self->data_ptr = NULL; + self->size = 0; + self->device = device; + } + else + { + void *data = NULL; + cudaError_t err = + (*self->allocator->malloc)(self->allocatorContext, + (void**)&(data), + size * elementSize, + THCState_getCurrentStreamOnDevice(state, device)); + THCudaCheck(err); + + if (self->data_ptr) { + // Enable p2p access when the memcpy is across devices + THCState_getPeerToPeerAccess(state, device, self->device); + + THCudaCheck(cudaMemcpyAsync(data, + self->data_ptr, + THMin(self->size, size) * elementSize, + cudaMemcpyDeviceToDevice, + THCState_getCurrentStream(state))); + if(self->flag & TH_STORAGE_FREEMEM) { + THCudaCheck( + (*self->allocator->free)(self->allocatorContext, self->data_ptr)); + } + } + + self->data_ptr = data; + self->size = size; + self->device = device; + } +} + +int THCStorage_getDevice(THCState* state, const THCStorage* storage) { + return storage->device; +} \ No newline at end of file diff --git a/aten/src/THC/THCStorage.h b/aten/src/THC/THCStorage.h index 20afc851fabc94..22a607ce43107f 100644 --- a/aten/src/THC/THCStorage.h +++ b/aten/src/THC/THCStorage.h @@ -9,7 +9,4 @@ #include "generic/THCStorage.h" #include "THCGenerateAllTypes.h" -// This exists to have a data-type independent way of freeing (necessary for THPPointer). 
-THC_API void THCStorage_free(THCState *state, THCStorage *self); - #endif diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index 817d522ff5163d..69b7ac63dcf53c 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -37,3 +37,20 @@ typedef struct THCStorage return static_cast(this->data_ptr); } } THCStorage; + +THC_API THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type); +THC_API THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size); + +THC_API THCStorage* THCStorage_newWithAllocator(THCState *state, + at::ScalarType scalar_type, + ptrdiff_t size, + THCDeviceAllocator* allocator, + void* allocatorContext); + +THC_API void THCStorage_retain(THCState *state, THCStorage *storage); + +// This exists to have a data-type independent way of freeing (necessary for THPPointer). +THC_API void THCStorage_free(THCState *state, THCStorage *self); + +THC_API void THCStorage_resize(THCState *state, THCStorage *storage, ptrdiff_t size); +THC_API int THCStorage_getDevice(THCState* state, const THCStorage* storage); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index d749ccb713398b..93950c13da7069 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -6,3 +6,409 @@ #include "generic/THCTensor.cpp" #include "THCGenerateAllTypes.h" + +#include "THCTensorInfo.cuh" + +int THCTensor_nDimension(THCState *state, const _THCTensor *self) { + return self->nDimension; +} + +int64_t THCTensor_size(THCState *state, const _THCTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range"); + return self->size[dim]; +} + +int64_t THCTensor_stride(THCState *state, const _THCTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range"); + return self->stride[dim]; +} +THLongStorage *THCTensor_newSizeOf(THCState *state, _THCTensor *self) { + THLongStorage *size = THLongStorage_newWithSize(self->nDimension); + THLongStorage_rawCopy(size, self->size); + return size; +} + +void THCTensor_resize(THCState *state, _THCTensor *self, THLongStorage *size, THLongStorage *stride) { + THArgCheck(size != NULL, 2, "invalid size"); + if(stride) + THArgCheck(stride->size == size->size, 3, "invalid stride"); + + THCTensor_resizeNd(state, self, size->size, THLongStorage_data(size), (stride ? 
THLongStorage_data(stride) : NULL)); +} + +void THCTensor_resizeAs(THCState *state, _THCTensor *self, _THCTensor *src) { + int isSame = 0; + int d; + if(self->nDimension == src->nDimension) + { + isSame = 1; + for(d = 0; d < self->nDimension; d++) + { + if(self->size[d] != src->size[d]) + { + isSame = 0; + break; + } + } + } + + if(!isSame) + THCTensor_resizeNd(state, self, src->nDimension, src->size, NULL); +} + +void THCTensor_resizeNd(THCState *state, _THCTensor *self, int nDimension, int64_t *size, int64_t *stride) +{ + int d; + int nDimension_; + ptrdiff_t totalSize; + int hascorrectsize = 1; + + nDimension_ = 0; + for(d = 0; d < nDimension; d++) + { + if(size[d] > 0) + { + nDimension_++; + if((self->nDimension > d) && (size[d] != self->size[d])) + hascorrectsize = 0; + + if((self->nDimension > d) && stride && (stride[d] >= 0) && (stride[d] != self->stride[d])) + hascorrectsize = 0; + } + else + break; + } + nDimension = nDimension_; + + if(nDimension != self->nDimension) + hascorrectsize = 0; + + if(hascorrectsize) + return; + + if(nDimension > 0) + { + if(nDimension != self->nDimension) + { + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*nDimension); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*nDimension); + self->nDimension = nDimension; + } + + totalSize = 1; + for(d = self->nDimension-1; d >= 0; d--) + { + self->size[d] = size[d]; + if(stride && (stride[d] >= 0) ) + self->stride[d] = stride[d]; + else + { + if(d == self->nDimension-1) + self->stride[d] = 1; + else + self->stride[d] = self->size[d+1]*self->stride[d+1]; + } + totalSize += (self->size[d]-1)*self->stride[d]; + } + + if(totalSize+self->storageOffset > 0) + { + if(!self->storage) + THError("Tensor: invalid null storage"); + if(totalSize+self->storageOffset > self->storage->size) + THCStorage_resize(state, self->storage, totalSize+self->storageOffset); + } + } + else + self->nDimension = 0; +} + +void THCTensor_set(THCState *state, _THCTensor *self, _THCTensor *src) +{ + if(self != src) + THCTensor_setStorageNd(state, + self, + src->storage, + src->storageOffset, + src->nDimension, + src->size, + src->stride); +} + +void THCTensor_setStorageNd(THCState *state, _THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) +{ + /* storage */ + if(self->storage != storage) + { + if (!self->storage) { + THError("Tensor: invalid null storage"); + } + auto scalar_type = self->storage->scalar_type; + THCStorage_free(state, self->storage); + + if(storage) + { + self->storage = storage; + THCStorage_retain(state, self->storage); + } + else + self->storage = THCStorage_new(state, scalar_type); + } + + /* storageOffset */ + if(storageOffset < 0) + THError("Tensor: invalid storage offset"); + self->storageOffset = storageOffset; + + /* size and stride */ + THCTensor_resizeNd(state, self, nDimension, size, stride); +} + + +void THCTensor_squeeze1d(THCState *state, _THCTensor *self, _THCTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck(dimension < src->nDimension, 3, "dimension out of range"); + + THCTensor_set(state, self, src); + + if(src->size[dimension] == 1 && src->nDimension > 1) + { + for(d = dimension; d < self->nDimension-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->nDimension--; + } +} + +void THCTensor_unsqueeze1d(THCState *state, _THCTensor *self, _THCTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck((dimension >= 0) && 
(dimension <= src->nDimension), 3, "dimension out of range"); + THArgCheck(src->nDimension > 0, 3, "cannot unsqueeze empty tensor"); + + THCTensor_set(state, self, src); + + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->nDimension+1)); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->nDimension+1)); + self->nDimension++; + for (d = self->nDimension-1; d > dimension; d--) { + self->size[d] = self->size[d-1]; + self->stride[d] = self->stride[d-1]; + } + if (dimension+1 < self->nDimension) { + self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + } else { + self->stride[dimension] = 1; + } + self->size[dimension] = 1; +} + +bool THCTensor_isContiguous(THCState *state, const _THCTensor *self) { + int64_t z = 1; + int d; + for(d = self->nDimension-1; d >= 0; d--) + { + if(self->size[d] != 1) + { + if(self->stride[d] == z) + z *= self->size[d]; + else + return false; + } + } + return true; +} + +bool THCTensor_allContiguous(THCState *state, const _THCTensor **inputs, int numInputs) { + THAssert(numInputs > 0); + for (int i = 0; i < numInputs; ++i) { + if (!THCTensor_isContiguous(state, inputs[i])) { + return false; + } + } + return true; +} + +ptrdiff_t THCTensor_nElement(THCState *state, const _THCTensor *self) { + if(self->nDimension == 0) + return 0; + else + { + ptrdiff_t nElement = 1; + int d; + for(d = 0; d < self->nDimension; d++) + nElement *= self->size[d]; + return nElement; + } +} + +void THCTensor_retain(THCState *state, _THCTensor *self) { + if(self->flag & TH_TENSOR_REFCOUNTED) + self->refcount++; +} + + +void THCTensor_free(THCState *state, _THCTensor *self) { + if(!self) + return; + + if(self->flag & TH_TENSOR_REFCOUNTED) + { + if(--self->refcount == 0) + { + THFree(self->size); + THFree(self->stride); + if(self->storage) + THCStorage_free(state, self->storage); + self->refcount.~atomic(); + THFree(self); + } + } +} + +int THCTensor_getDevice(THCState* state, const _THCTensor* tensor) { + if (!tensor->storage) return -1; + return THCStorage_getDevice(state, tensor->storage); +} + +bool THCTensor_allSameDevice(THCState* state, const _THCTensor ** inputs, int numInputs) { + THAssert(numInputs > 0); + int device = THCTensor_getDevice(state, inputs[0]); + for (int i = 1; i < numInputs; ++i) { + if (THCTensor_getDevice(state, inputs[i]) != device) { + return false; + } + } + return true; +} + +bool THCTensor_canUse32BitIndexMath(THCState* state, const _THCTensor* t, ptrdiff_t max_elem) { + ptrdiff_t elements = THCTensor_nElement(state, t); + if (elements >= max_elem) { + return false; + } + + ptrdiff_t offset = 0; + ptrdiff_t linearId = elements - 1; + + for (int i = THCTensor_nDimension(state, t) - 1; i >= 0; --i) { + ptrdiff_t curDimIndex = + linearId % THCTensor_size(state, t, i); + ptrdiff_t curDimOffset = curDimIndex * + THCTensor_stride(state, t, i); + offset += curDimOffset; + linearId /= THCTensor_size(state, t, i); + } + + if (offset >= max_elem) { + return false; + } + + return true; +} + +bool THCTensor_all32BitIndexable(THCState* state, const _THCTensor** inputs, int numInputs) { + for (int i = 0; i < numInputs; ++i) { + if (!THCTensor_canUse32BitIndexMath(state, inputs[i])) { + return false; + } + } + return true; +} + +/* Due to the resize semantics of ops with `out=` keywords, if */ \ +/* the output `tensor` has the same shape as the output of the */ \ +/* reduction operation, then any noncontiguities in the output */ \ +/* `tensor` should be preserved. 
This needs to be special cased b/c */ \ +/* otherwise, when keepdim=False, the implementations of reduction */ \ +/* ops resize `tensor` to the reduced size with keepdim=True, and */ \ +/* then later squeeze `tensor` to the correct output size, breaking */ \ +/* the contiguity guarantees of the resize semantics. */ \ +void THCTensor_preserveReduceDimSemantics(THCState *state, _THCTensor *tensor, + int in_dims, int64_t dimension, int keepdim) { + int out_dims = THCTensor_nDimension(state, tensor); + if (out_dims > 0 && !keepdim && out_dims == in_dims - 1) { + THCTensor_unsqueeze1d(state, tensor, tensor, dimension); + } +} + +namespace { + +struct SizeAndStride { + int64_t size; + int64_t stride; +}; + +/* + A comparator that will sort SizeAndStride structs by stride, + in ascending order. + */ +int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + if (aS->stride < bS->stride) return -1; + if (aS->stride == bS->stride) return 0; + return 1; +} + +} + +/* Returns false if there is no possibility that the tensor */ +/* has "overlapping" indices and true otherwise. */ +/* "Overlapping" indices are two+ valid indices that specify */ +/* the same offset within the tensor. */ +/* The function does this by checking for a sufficient but not */ +/* necessary condition of no overlap. In particular, that */ +/* that there exists an ordering of the tensor's dimensions */ +/* that is nicely "nested," with each dimension contained */ +/* within the next one. */ +bool THCTensor_maybeOverlappingIndices(THCState* state, const _THCTensor* t) { + /* Extract size/stride arrays; only consider size >1 dims. */ + SizeAndStride info[MAX_CUTORCH_DIMS]; + + int dims = THCTensor_nDimension(state, t); + int nonSize1Dims = 0; + for (int i = 0; i < dims; ++i) { + int64_t size = THCTensor_size(state, t, i); + + if (size > 1) { + info[nonSize1Dims].size = size; + info[nonSize1Dims].stride = + THCTensor_stride(state, t, i); + + if (info[nonSize1Dims].stride < 1) { + return true; + } + + ++nonSize1Dims; + } + } + + /* Short-circuits if tensor is a single element. 
*/ + if (nonSize1Dims == 0) { + return false; + } + + /* Ascending order (innermost dimension in sorted view is at [0]) */ + qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); + + for (int i = 0; i < (nonSize1Dims - 1); ++i) { + if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { + return true; + } + } + + return false; +} diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp index b82279df731e32..c230d117913f7c 100644 --- a/aten/src/THC/THCTensor.hpp +++ b/aten/src/THC/THCTensor.hpp @@ -9,5 +9,55 @@ #include +typedef struct _THCTensor +{ + int64_t *size; + int64_t *stride; + int nDimension; + + THCStorage *storage; + ptrdiff_t storageOffset; + std::atomic refcount; + + char flag; + +} _THCTensor; + #include "generic/THCTensor.hpp" #include "THCGenerateAllTypes.h" + +THC_API int THCTensor_nDimension(THCState *state, const _THCTensor *self); +THC_API int64_t THCTensor_size(THCState *state, const _THCTensor *self, int dim); +THC_API int64_t THCTensor_stride(THCState *state, const _THCTensor *self, int dim); +THC_API THLongStorage *THCTensor_newSizeOf(THCState *state, _THCTensor *self); + +THC_API void THCTensor_resize(THCState *state, _THCTensor *tensor, THLongStorage *size, THLongStorage *stride); +THC_API void THCTensor_resizeAs(THCState *state, _THCTensor *tensor, _THCTensor *src); +THC_API void THCTensor_resizeNd(THCState *state, _THCTensor *tensor, int nDimension, int64_t *size, int64_t *stride); + +THC_API void THCTensor_set(THCState *state, _THCTensor *self, _THCTensor *src); +THC_API void THCTensor_setStorageNd(THCState *state, _THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride); + +THC_API void THCTensor_squeeze1d(THCState *state, _THCTensor *self, _THCTensor *src, int dimension_); +THC_API void THCTensor_unsqueeze1d(THCState *state, _THCTensor *self, _THCTensor *src, int dimension_); + +THC_API bool THCTensor_isContiguous(THCState *state, const _THCTensor *self); +THC_API bool THCTensor_allContiguous(THCState *state, const _THCTensor **inputs, int numInputs); +THC_API ptrdiff_t THCTensor_nElement(THCState *state, const _THCTensor *self); + +THC_API void THCTensor_retain(THCState *state, _THCTensor *self); +THC_API void THCTensor_free(THCState *state, _THCTensor *self); + +THC_API int THCTensor_getDevice(THCState* state, const _THCTensor* tensor); +THC_API bool THCTensor_allSameDevice(THCState* state, const _THCTensor ** inputs, int numInputs); + +/* Can we use 32 bit math for indexing? */ +THC_API bool THCTensor_canUse32BitIndexMath(THCState* state, const _THCTensor* t, ptrdiff_t max_elem=INT32_MAX); +/* Are all tensors 32-bit indexable? */ +THC_API bool THCTensor_all32BitIndexable(THCState* state, const _THCTensor** inputs, int numInputs); +THC_API void THCTensor_preserveReduceDimSemantics(THCState *state, _THCTensor *tensor, int in_dims, + int64_t dimension, int keepdim); +/* Returns false if there is no possibility that the tensor */ +/* has more than one index that references the same datapoint, */ +/* true otherwise. 
*/ +THC_API bool THCTensor_maybeOverlappingIndices(THCState* state, const _THCTensor* t); diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index 503f90172d2263..c7f1eee78a7060 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -27,13 +27,13 @@ THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { static_assert(std::is_same::DataType>::value, "ScalarTypeDst must match"); static_assert(std::is_same::DataType>::value, "ScalarTypeSrc must match"); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, dst); + ptrdiff_t totalElements = THCTensor_nElement(state, dst); THArgCheck(totalElements == - TensorUtils::getNumElements(state, src), + THCTensor_nElement(state, src), 2, "sizes do not match"); - if (TensorUtils::getDims(state, dst) == 0) { + if (THCTensor_nDimension(state, dst) == 0) { // Zero-dim tensor; copy nothing return; } @@ -47,13 +47,13 @@ THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { // contiguous). // -AND: both tensors have the same type. bool sameType = isSameType(); - bool srcContig = TensorUtils::isContiguous(state, src); - bool dstContig = TensorUtils::isContiguous(state, dst); + bool srcContig = THCTensor_isContiguous(state, src); + bool dstContig = THCTensor_isContiguous(state, dst); bool memcpyEligible = ((srcContig && dstContig) || (totalElements == 1)) && sameType; - int srcDev = TensorUtils::getDevice(state, src); - int dstDev = TensorUtils::getDevice(state, dst); + int srcDev = THCTensor_getDevice(state, src); + int dstDev = THCTensor_getDevice(state, dst); int oldDev = curGPU(); // Try to enable p2p access. This also handles the case srcDev == dstDev. @@ -140,7 +140,7 @@ THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { // Types are different // Copy into the new format, contiguous, on the source device srcContig = TensorUtils::newTensor(state); - TensorUtils::resizeAs(state, srcContig, dst); + THCTensor_resizeAs(state, srcContig, dst); bool succ = THC_pointwiseApply2::free(state, srcContig); + THCTensor_free(state, srcContig); if (dst != dstContig) { TensorUtils::freeCopyTo(state, dstContig, dst); } else { - TensorUtils::free(state, dstContig); + THCTensor_free(state, dstContig); } // We're still on srcDev at this point diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index 32954c4a3ebf9d..68bea1b16d6f26 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -459,7 +459,7 @@ void dispatchTakePutImpl(THCState *state, TensorType *a, TensorType *b, THCudaLo auto aInfo = getTensorInfo(state, a); aInfo.collapseDims(); - auto numel = TensorUtils::getNumElements(state, a); + auto numel = THCTensor_nElement(state, a); if (aInfo.isContiguous()) { auto op = Op(aInfo, numel, start, end); THC_pointwiseApply2(state, b, index, op); @@ -471,7 +471,7 @@ void dispatchTakePutImpl(THCState *state, TensorType *a, TensorType *b, THCudaLo template class Op, typename TensorType> void dispatchTakePut(THCState *state, TensorType *a, TensorType *b, THCudaLongTensor *index) { - if (TensorUtils::canUse32BitIndexMath(state, a, INT_MAX)) { + if (THCTensor_canUse32BitIndexMath(state, a, INT_MAX)) { dispatchTakePutImpl(state, a, b, index); } else { dispatchTakePutImpl(state, a, b, index); diff --git a/aten/src/THC/THCTensorMathCompare.cuh b/aten/src/THC/THCTensorMathCompare.cuh index 953f6a59090561..9fac6089acb2aa 100644 --- a/aten/src/THC/THCTensorMathCompare.cuh +++ b/aten/src/THC/THCTensorMathCompare.cuh @@ 
-73,8 +73,8 @@ void THC_logicalValue(THCState *state, TensorTypeOut *self_, TensorType *src, Op op) { - THLongStorage* st = TensorUtils::newSizeOf(state, src); - TensorUtils::resize(state, self_, st, NULL); + THLongStorage* st = THCTensor_newSizeOf(state, src); + THCTensor_resize(state, self_, st, NULL); THLongStorage_free(st); if (!THC_pointwiseApply2(state, self_, src, op)) { diff --git a/aten/src/THC/THCTensorMathCompareT.cuh b/aten/src/THC/THCTensorMathCompareT.cuh index e2351e1404b3b9..9b1fb4e50bdddc 100644 --- a/aten/src/THC/THCTensorMathCompareT.cuh +++ b/aten/src/THC/THCTensorMathCompareT.cuh @@ -56,12 +56,12 @@ void THC_logicalTensor(THCState *state, TensorType *src1, TensorType *src2, Op op) { - THLongStorage* st = TensorUtils::newSizeOf(state, src1); - TensorUtils::resize(state, self_, st, NULL); + THLongStorage* st = THCTensor_newSizeOf(state, src1); + THCTensor_resize(state, self_, st, NULL); THLongStorage_free(st); - THArgCheck(TensorUtils::getNumElements(state, src1) == - TensorUtils::getNumElements(state, src2), 3, + THArgCheck(THCTensor_nElement(state, src1) == + THCTensor_nElement(state, src2), 3, "sizes do not match"); if (!THC_pointwiseApply3(state, self_, src1, src2, op)) { diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh index 67dfaf06a43748..26389c3a990baf 100644 --- a/aten/src/THC/THCTensorMathPointwise.cuh +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -37,7 +37,7 @@ struct TensorSigmoidOp { __device__ __forceinline__ void operator()(half* out, half* in) const { #ifdef CUDA_HALF_INSTRUCTIONS half one = ScalarConvert::to(1); - *out = hdiv(one, __hadd(one, hexp(__hneg(*in)))); + *out = __hdiv(one, __hadd(one, hexp(__hneg(*in)))); #else float fin = __half2float(*in); *out = __float2half(1.0f / (1.0f + expf(- fin))); @@ -47,7 +47,7 @@ struct TensorSigmoidOp { __device__ __forceinline__ void operator()(half* v) const { #ifdef CUDA_HALF_INSTRUCTIONS half one = ScalarConvert::to(1); - *v = hdiv(one, __hadd(one, hexp(__hneg(*v)))); + *v = __hdiv(one, __hadd(one, hexp(__hneg(*v)))); #else float fv = __half2float(*v); *v = __float2half(1.0f / (1.0f + expf(- fv))); diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh index b95eb5d9bb3962..e4def67689fa03 100644 --- a/aten/src/THC/THCTensorMathReduce.cuh +++ b/aten/src/THC/THCTensorMathReduce.cuh @@ -290,17 +290,17 @@ __global__ void THCTensor_kernel_varOuterDim(T *tgt, T *src_, unsigned num_orows template __host__ void THCTensor_varOuterDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int64_t dimension, int flag) { - unsigned ndim = TensorUtils::getDims(state, src); + unsigned ndim = THCTensor_nDimension(state, src); // Treat all outer dimensions (i.e. dim < dimension) as one. unsigned num_orows = 1; for (int64_t dim = 0; dim < dimension; dim++) { - num_orows *= TensorUtils::getSize(state, src, dim); + num_orows *= THCTensor_size(state, src, dim); } - unsigned row_size = TensorUtils::getSize(state, src, dimension); + unsigned row_size = THCTensor_size(state, src, dimension); // Treat all inner dimensions (i.e. dim > dimension) as one. 
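// (Illustrative: for a src of shape [a, b, c, d] with dimension == 1, the loops above and below yield
//  num_orows = a, row_size = b, and num_irows = c * d.)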
unsigned num_irows = 1; for (unsigned dim = dimension + 1; dim < ndim; dim++) { - num_irows *= TensorUtils::getSize(state, src, dim); + num_irows *= THCTensor_size(state, src, dim); } dim3 threads(min(512, num_irows)); @@ -436,13 +436,13 @@ __global__ void THCTensor_kernel_varInnermostDim(T *tgt, T *src_, unsigned num_r template __host__ void THCTensor_varInnermostDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int flag) { - unsigned ndim = TensorUtils::getDims(state, src); + unsigned ndim = THCTensor_nDimension(state, src); // Treat all outer dimensions as a single dimension. unsigned num_rows = 1; for (unsigned dim = 0; dim < ndim - 1; dim++) { - num_rows *= TensorUtils::getSize(state, src, dim); + num_rows *= THCTensor_size(state, src, dim); } - unsigned row_size = TensorUtils::getSize(state, src, ndim - 1); + unsigned row_size = THCTensor_size(state, src, ndim - 1); // From limited testing, 16x32 seemed a good compromise for handling both long and short dimensions. dim3 threads(16, 32); @@ -509,15 +509,15 @@ THC_transformReduceOuterDimIndex(THCState *state, int64_t rdim, const thrust::pair& init, BinaryFunction binary_op) { - unsigned ndim = TensorUtils::getDims(state, src); + unsigned ndim = THCTensor_nDimension(state, src); unsigned num_orows = 1; for (int64_t dim = 0; dim < rdim; dim++) { - num_orows *= TensorUtils::getSize(state, src, dim); + num_orows *= THCTensor_size(state, src, dim); } - unsigned row_size = TensorUtils::getSize(state, src, rdim); + unsigned row_size = THCTensor_size(state, src, rdim); unsigned num_irows = 1; for (unsigned dim = rdim + 1; dim < ndim; dim++) { - num_irows *= TensorUtils::getSize(state, src, dim); + num_irows *= THCTensor_size(state, src, dim); } dim3 threads(min(512, num_irows)); @@ -612,12 +612,12 @@ THC_transformReduceInnermostDimIndex(THCState *state, TensorTypeK *src, const thrust::pair& init, BinaryFunction binary_op) { - unsigned ndim = TensorUtils::getDims(state, src); + unsigned ndim = THCTensor_nDimension(state, src); unsigned num_rows = 1; for (unsigned dim = 0; dim < ndim - 1; dim++) { - num_rows *= TensorUtils::getSize(state, src, dim); + num_rows *= THCTensor_size(state, src, dim); } - unsigned row_size = TensorUtils::getSize(state, src, ndim - 1); + unsigned row_size = THCTensor_size(state, src, ndim - 1); dim3 threads(16, 32); dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); @@ -650,40 +650,40 @@ THC_reduceDimIndex(THCState *state, static_assert(std::is_same::DataType>::value, "ScalarTypeK must match"); static_assert(std::is_same::DataType>::value, "ScalarTypeIndex must match"); THArgCheck(dimension >= 0 && - dimension < TensorUtils::getDims(state, src), + dimension < THCTensor_nDimension(state, src), 3, "dimension out of range"); // Unsqueeze tgt1_/tgt_2 if necessary so that their contiguity traits // are preserved if they are the same size as the correct reduction output. 
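// (Illustrative: with keepdim=false, a caller may pass pre-allocated [4, 6] tgt tensors for a reduction
//  of a [4, 5, 6] src over dimension 1; unsqueezing them to [4, 1, 6] first makes the resize below a
//  no-op, so any non-contiguous layout of the outputs survives, matching the out= resize semantics.)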
- int src_dims = TensorUtils::getDims(state, src); - TensorUtils::preserveReduceDimSemantics( + int src_dims = THCTensor_nDimension(state, src); + THCTensor_preserveReduceDimSemantics( state, tgt1_, src_dims, dimension, keepdim); - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, tgt2_, src_dims, dimension, keepdim); - THLongStorage *dim = TensorUtils::newSizeOf(state, src); + THLongStorage *dim = THCTensor_newSizeOf(state, src); THLongStorage_set(dim, dimension, 1); - TensorUtils::resize(state, tgt1_, dim, NULL); - TensorUtils::resize(state, tgt2_, dim, NULL); + THCTensor_resize(state, tgt1_, dim, NULL); + THCTensor_resize(state, tgt2_, dim, NULL); THLongStorage_free(dim); TensorTypeK *tgt1 = TensorUtils::newContiguous(state, tgt1_); TensorTypeIndex *tgt2 = TensorUtils::newContiguous(state, tgt2_); src = TensorUtils::newContiguous(state, src); - if (dimension == TensorUtils::getDims(state, src) - 1) { + if (dimension == THCTensor_nDimension(state, src) - 1) { THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op); } else { THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op); } - TensorUtils::free(state, src); + THCTensor_free(state, src); TensorUtils::freeCopyTo(state, tgt1, tgt1_); TensorUtils::freeCopyTo(state, tgt2, tgt2_); if (!keepdim) { - TensorUtils::squeeze1d(state, tgt1_, tgt1_, dimension); - TensorUtils::squeeze1d(state, tgt2_, tgt2_, dimension); + THCTensor_squeeze1d(state, tgt1_, tgt1_, dimension); + THCTensor_squeeze1d(state, tgt2_, tgt2_, dimension); } } diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index 432138493dd72f..8ee4b122e3382b 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -13,7 +13,7 @@ #include #include -#define MAX_NUM_BLOCKS 200 +#define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 @@ -170,6 +170,7 @@ __global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size, // NOTE: curand_uniform is (0, 1] and we want [a, b) GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) +GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a) GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean) diff --git a/aten/src/THC/THCTensorRandom.cuh b/aten/src/THC/THCTensorRandom.cuh index 4637c4555963b6..7749f231c5c771 100644 --- a/aten/src/THC/THCTensorRandom.cuh +++ b/aten/src/THC/THCTensorRandom.cuh @@ -183,6 +183,8 @@ sampleMultinomialOnce(int64_t* dest, for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) { val = dist[curDist * stride_dist + cat * stride_categories]; assert(THCNumerics::ge(val, zero)); + assert(!THCNumerics::isinf(val)); + assert(!THCNumerics::isnan(val)); sum = THCNumerics::add(sum, ScalarConvert::to(val)); } diff --git a/aten/src/THC/THCTensorSort.cu b/aten/src/THC/THCTensorSort.cu index 6d3dbb5cced3ad..3635886300fc39 100644 --- a/aten/src/THC/THCTensorSort.cu +++ b/aten/src/THC/THCTensorSort.cu @@ -29,7 +29,7 @@ void THCudaLongTensor_fillSliceWithIndex(THCState* state, <<>>( \ info, numSlices, sliceSize, info.strides[collapseDim]) - if (TensorUtils::canUse32BitIndexMath(state, t)) { + if (THCTensor_canUse32BitIndexMath(state, t)) { TensorInfo info = getTensorInfo(state, t); 
info.reduceDim(dim); diff --git a/aten/src/THC/THCTensorTypeUtils.cu b/aten/src/THC/THCTensorTypeUtils.cu index b38766381bd013..f29348a92d9b6b 100644 --- a/aten/src/THC/THCTensorTypeUtils.cu +++ b/aten/src/THC/THCTensorTypeUtils.cu @@ -4,28 +4,6 @@ #include "THCHalf.h" #include -namespace { - -struct SizeAndStride { - int64_t size; - int64_t stride; -}; - -/* - A comparator that will sort SizeAndStride structs by stride, - in ascending order. - */ -int compareSizeAndStride(const void* a, const void* b) { - const SizeAndStride* aS = (const SizeAndStride*) a; - const SizeAndStride* bS = (const SizeAndStride*) b; - - if (aS->stride < bS->stride) return -1; - if (aS->stride == bS->stride) return 0; - return 1; -} - -} - #define IMPL_TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \ \ TENSOR_TYPE* \ @@ -39,24 +17,6 @@ TensorUtils::newContiguous(THCState* state, \ return TENSOR_TYPE##_newContiguous(state, t); \ } \ \ -THLongStorage* \ -TensorUtils::newSizeOf(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_newSizeOf(state, t); \ -} \ - \ -void \ -TensorUtils::retain(THCState* state, \ - TENSOR_TYPE* t) { \ - TENSOR_TYPE##_retain(state, t); \ -} \ - \ -void \ -TensorUtils::free(THCState* state, \ - TENSOR_TYPE* t) { \ - TENSOR_TYPE##_free(state, t); \ -} \ - \ void \ TensorUtils::freeCopyTo(THCState* state, \ TENSOR_TYPE* src, \ @@ -64,37 +24,6 @@ TensorUtils::freeCopyTo(THCState* state, \ TENSOR_TYPE##_freeCopyTo(state, src, dst); \ } \ \ -void \ -TensorUtils::resize(THCState* state, \ - TENSOR_TYPE* out, \ - THLongStorage* sizes, \ - THLongStorage* strides) { \ - TENSOR_TYPE##_resize(state, out, sizes, strides); \ -} \ - \ -void \ -TensorUtils::resizeAs(THCState* state, \ - TENSOR_TYPE* dst, \ - TENSOR_TYPE* src) { \ - TENSOR_TYPE##_resizeAs(state, dst, src); \ -} \ - \ -void \ -TensorUtils::squeeze1d(THCState *state, \ - TENSOR_TYPE *dst, \ - TENSOR_TYPE *src, \ - int dimension) { \ - TENSOR_TYPE##_squeeze1d(state, dst, src, dimension); \ -} \ - \ -void \ -TensorUtils::unsqueeze1d(THCState *state, \ - TENSOR_TYPE *dst, \ - TENSOR_TYPE *src, \ - int dimension) { \ - TENSOR_TYPE##_unsqueeze1d(state, dst, src, dimension); \ -} \ - \ DATA_TYPE* \ TensorUtils::getData(THCState* state, \ TENSOR_TYPE* t) { \ @@ -102,185 +31,12 @@ TensorUtils::getData(THCState* state, \ return (DATA_TYPE*) TENSOR_TYPE##_data(state, t); \ } \ \ -ptrdiff_t \ -TensorUtils::getNumElements(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_nElement(state, t); \ -} \ - \ -int64_t \ -TensorUtils::getSize(THCState* state, \ - TENSOR_TYPE* t, \ - int dim) { \ - return TENSOR_TYPE##_size(state, t, dim); \ -} \ - \ -int64_t \ -TensorUtils::getStride(THCState* state, \ - TENSOR_TYPE* t, \ - int dim) { \ - return TENSOR_TYPE##_stride(state, t, dim); \ -} \ - \ -int \ -TensorUtils::getDims(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_nDimension(state, t); \ -} \ - \ -bool \ -TensorUtils::isContiguous(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_isContiguous(state, t); \ -} \ - \ -bool \ -TensorUtils::allContiguous(THCState* state, \ - TENSOR_TYPE** inputs, \ - int numInputs) { \ - THAssert(numInputs > 0); \ - for (int i = 0; i < numInputs; ++i) { \ - if (!TensorUtils::isContiguous(state, inputs[i])) { \ - return false; \ - } \ - } \ - return true; \ -} \ - \ -/* Due to the resize semantics of ops with `out=` keywords, if */ \ -/* the output `tensor` has the same shape as the output of the */ \ -/* reduction operation, then any noncontiguities in the output */ \ -/* `tensor` 
should be preserved. This needs to be special cased b/c */ \ -/* otherwise, when keepdim=False, the implementations of reduction */ \ -/* ops resize `tensor` to the reduced size with keepdim=True, and */ \ -/* then later squeeze `tensor` to the correct output size, breaking */ \ -/* the contiguity guarantees of the resize semantics. */ \ -void \ -TensorUtils::preserveReduceDimSemantics( \ - THCState *state, TENSOR_TYPE *tensor, \ - int in_dims, int64_t dimension, int keepdim) {\ - int out_dims = TensorUtils::getDims(state, tensor); \ - if (out_dims > 0 && !keepdim && out_dims == in_dims - 1) { \ - TensorUtils::unsqueeze1d(state, tensor, tensor, dimension);\ - } \ -} \ - \ -int \ -TensorUtils::getDevice(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_getDevice(state, t); \ -} \ - \ -bool \ -TensorUtils::allSameDevice(THCState* state, \ - TENSOR_TYPE** inputs, \ - int numInputs) { \ - THAssert(numInputs > 0); \ - int device = TensorUtils::getDevice(state, inputs[0]); \ - for (int i = 1; i < numInputs; ++i) { \ - if (TensorUtils::getDevice(state, inputs[i]) != device) { \ - return false; \ - } \ - } \ - return true; \ -} \ - \ void \ TensorUtils::copyIgnoringOverlaps(THCState* state, \ TENSOR_TYPE* dst, \ TENSOR_TYPE* src) { \ return TENSOR_TYPE##_copyIgnoringOverlaps(state, dst, src); \ } \ - \ -/* Returns false if there is no possibility that the tensor */ \ -/* has "overlapping" indices and true otherwise. */ \ -/* "Overlapping" indices are two+ valid indices that specify */ \ -/* the same offset within the tensor. */ \ -/* The function does this by checking for a sufficient but not */ \ -/* necessary condition of no overlap. In particular, that */ \ -/* that there exists an ordering of the tensor's dimensions */ \ -/* that is nicely "nested," with each dimension contained */ \ -/* within the next one. */ \ -bool \ -TensorUtils::maybeOverlappingIndices(THCState* state, \ - TENSOR_TYPE* t) { \ - /* Extract size/stride arrays; only consider size >1 dims. */ \ - SizeAndStride info[MAX_CUTORCH_DIMS]; \ - \ - int dims = TensorUtils::getDims(state, t); \ - int nonSize1Dims = 0; \ - for (int i = 0; i < dims; ++i) { \ - int64_t size = TensorUtils::getSize(state, t, i); \ - \ - if (size > 1) { \ - info[nonSize1Dims].size = size; \ - info[nonSize1Dims].stride = \ - TensorUtils::getStride(state, t, i); \ - \ - if (info[nonSize1Dims].stride < 1) { \ - return true; \ - } \ - \ - ++nonSize1Dims; \ - } \ - } \ - \ - /* Short-circuits if tensor is a single element. 
*/ \ - if (nonSize1Dims == 0) { \ - return false; \ - } \ - \ - /* Ascending order (innermost dimension in sorted view is at [0]) */ \ - qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); \ - \ - for (int i = 0; i < (nonSize1Dims - 1); ++i) { \ - if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { \ - return true; \ - } \ - } \ - \ - return false; \ -} \ - \ -bool \ -TensorUtils::canUse32BitIndexMath(THCState* state, \ - TENSOR_TYPE* t, \ - ptrdiff_t max_elem) { \ - ptrdiff_t elements = TensorUtils::getNumElements(state, t); \ - if (elements >= max_elem) { \ - return false; \ - } \ - \ - ptrdiff_t offset = 0; \ - ptrdiff_t linearId = elements - 1; \ - \ - for (int i = TensorUtils::getDims(state, t) - 1; i >= 0; --i) { \ - ptrdiff_t curDimIndex = \ - linearId % TensorUtils::getSize(state, t, i); \ - ptrdiff_t curDimOffset = curDimIndex * \ - TensorUtils::getStride(state, t, i); \ - offset += curDimOffset; \ - linearId /= TensorUtils::getSize(state, t, i); \ - } \ - \ - if (offset >= max_elem) { \ - return false; \ - } \ - \ - return true; \ -} \ - \ -bool \ -TensorUtils::all32BitIndexable(THCState* state, \ - TENSOR_TYPE** inputs, \ - int numInputs) { \ - for (int i = 0; i < numInputs; ++i) { \ - if (!TensorUtils::canUse32BitIndexMath(state, inputs[i])) { \ - return false; \ - } \ - } \ - return true; \ -} IMPL_TENSOR_UTILS(THCudaByteTensor, uint8_t) IMPL_TENSOR_UTILS(THCudaCharTensor, int8_t) diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh index 3ec5f367583c8d..f8f3d4f01cbfeb 100644 --- a/aten/src/THC/THCTensorTypeUtils.cuh +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -5,8 +5,9 @@ #include #include "THCGeneral.h" #include "THCHalf.h" -#include "THCTensor.h" +#include "THCTensor.hpp" #include "THCTensorInfo.cuh" +#include "THCTensor.hpp" /// A utility for accessing THCuda*Tensor types in a generic manner @@ -39,42 +40,11 @@ struct TensorUtils { \ static TENSOR_TYPE* newTensor(THCState* state); \ static TENSOR_TYPE* newContiguous(THCState* state, TENSOR_TYPE* t); \ - static THLongStorage* newSizeOf(THCState* state, TENSOR_TYPE* t); \ - static void retain(THCState* state, TENSOR_TYPE* t); \ - static void free(THCState* state, TENSOR_TYPE* t); \ static void freeCopyTo(THCState* state, TENSOR_TYPE* src, \ TENSOR_TYPE* dst); \ - static void resize(THCState* state, TENSOR_TYPE* out, \ - THLongStorage* sizes, \ - THLongStorage* strides); \ - static void resizeAs(THCState* state, TENSOR_TYPE* dst, \ - TENSOR_TYPE* src); \ - static void squeeze1d(THCState *state, TENSOR_TYPE *dst, \ - TENSOR_TYPE *src, int dimension); \ - static void unsqueeze1d(THCState *state, TENSOR_TYPE *dst, \ - TENSOR_TYPE *src, int dimension); \ - static void preserveReduceDimSemantics( \ - THCState *state, TENSOR_TYPE *tensor, \ - int in_dims, int64_t dimension, int keepdim); \ static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \ - static ptrdiff_t getNumElements(THCState* state, TENSOR_TYPE* t); \ - static int64_t getSize(THCState* state, TENSOR_TYPE* t, int dim); \ - static int64_t getStride(THCState* state, TENSOR_TYPE* t, int dim); \ - static int getDims(THCState* state, TENSOR_TYPE* t); \ - static bool isContiguous(THCState* state, TENSOR_TYPE* t); \ - static bool allContiguous(THCState* state, TENSOR_TYPE** inputs, int numInputs); \ - static int getDevice(THCState* state, TENSOR_TYPE* t); \ - static bool allSameDevice(THCState* state, TENSOR_TYPE** inputs, int numInputs); \ static void copyIgnoringOverlaps(THCState* state, \ 
TENSOR_TYPE* dst, TENSOR_TYPE* src); \ - /* Returns false if there is no possibility that the tensor */ \ - /* has more than one index that references the same datapoint, */ \ - /* true otherwise. */ \ - static bool maybeOverlappingIndices(THCState* state, TENSOR_TYPE* t); \ - /* Can we use 32 bit math for indexing? */ \ - static bool canUse32BitIndexMath(THCState* state, TENSOR_TYPE* t, ptrdiff_t max_elem=INT32_MAX); \ - /* Are all tensors 32-bit indexable? */ \ - static bool all32BitIndexable(THCState* state, TENSOR_TYPE** inputs, int numInputs); \ } TENSOR_UTILS(THCudaByteTensor, uint8_t, int64_t); @@ -124,10 +94,10 @@ getTensorInfo(THCState* state, TensorType* t) { IndexType sz[MAX_CUTORCH_DIMS]; IndexType st[MAX_CUTORCH_DIMS]; - int dims = TensorUtils::getDims(state, t); + int dims = THCTensor_nDimension(state, t); for (int i = 0; i < dims; ++i) { - sz[i] = TensorUtils::getSize(state, t, i); - st[i] = TensorUtils::getStride(state, t, i); + sz[i] = THCTensor_size(state, t, i); + st[i] = THCTensor_stride(state, t, i); } return TensorInfo( @@ -155,7 +125,7 @@ struct ScalarNegate { return __float2half(-__half2float(v)); #endif #else -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) half out = v; #else __half_raw out = __half_raw(v); @@ -180,7 +150,7 @@ struct ScalarInv { }; inline bool operator==(half a, half b) { -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) return a.x == b.x; #else __half_raw araw, braw; @@ -191,7 +161,7 @@ inline bool operator==(half a, half b) { } inline bool operator!=(half a, half b) { -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) return a.x != b.x; #else __half_raw araw, braw; diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index fba23a2ad94922..5f7baabf71b37f 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -40,51 +40,20 @@ real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) THCStorage* THCStorage_(new)(THCState *state) { - return THCStorage_(newWithSize)(state, 0); + return THCStorage_new(state, at::CTypeToScalarType>::to()); } THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { - return THCStorage_(newWithAllocator)( - state, size, - state->cudaDeviceAllocator, - state->cudaDeviceAllocator->state); + return THCStorage_newWithSize(state, at::CTypeToScalarType>::to(), size); } THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, THCDeviceAllocator* allocator, void* allocatorContext) { - THArgCheck(size >= 0, 2, "invalid size"); - int device; - THCudaCheck(cudaGetDevice(&device)); - - THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); - memset(storage, 0, sizeof(THCStorage)); - new (&storage->refcount) std::atomic(1); - storage->scalar_type = at::CTypeToScalarType>::to(); - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; - storage->allocator = allocator; - storage->allocatorContext = allocatorContext; - storage->size = size; - storage->device = device; - - if(size > 0) - { - // update heap *before* attempting malloc, to free space for the malloc - cudaError_t err = - (*allocator->malloc)(allocatorContext, - (void**)&(storage->data_ptr), - size * sizeof(real), - THCState_getCurrentStream(state)); - if(err != cudaSuccess){ - free(storage); - } - THCudaCheck(err); - } else { - storage->data_ptr = NULL; - } - return storage; + return 
THCStorage_newWithAllocator(state, at::CTypeToScalarType>::to(), + size, allocator, allocatorContext); } THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) @@ -170,8 +139,7 @@ void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char fla void THCStorage_(retain)(THCState *state, THCStorage *self) { - if(self && (self->flag & TH_STORAGE_REFCOUNTED)) - self->refcount++; + THCStorage_retain(state, self); } int THCStorage_(retainIfLive)(THCState *state, THCStorage *storage) diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index 9676b297b09b6c..c3f25f4f037c03 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -15,72 +15,11 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value) void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) { - THArgCheck(size >= 0, 2, "invalid size"); - THAssert(self->allocator != NULL); - int device; - THCudaCheck(cudaGetDevice(&device)); - - if(!(self->flag & TH_STORAGE_RESIZABLE)) - THError("Trying to resize storage that is not resizable"); - - if (self->allocator->realloc) { - real * data_ptr = self->data(); - cudaError_t err = (*self->allocator->realloc)( - self->allocatorContext, - (void**)&(data_ptr), - self->size * sizeof(real), - size * sizeof(real), THCState_getCurrentStream(state)); - if (err != cudaSuccess) { - THCudaCheck(err); - } - self->size = size; - self->device = device; - return; - } - - if(size == 0) - { - if(self->flag & TH_STORAGE_FREEMEM) { - THCudaCheck( - (*self->allocator->free)(self->allocatorContext, THCStorage_(data)(state, self))); - } - self->data_ptr = NULL; - self->size = 0; - self->device = device; - } - else - { - real *data = NULL; - cudaError_t err = - (*self->allocator->malloc)(self->allocatorContext, - (void**)&(data), - size * sizeof(real), - THCState_getCurrentStream(state)); - THCudaCheck(err); - - if (THCStorage_(data)(state, self)) { - // Enable p2p access when the memcpy is across devices - THCState_getPeerToPeerAccess(state, device, self->device); - - THCudaCheck(cudaMemcpyAsync(data, - THCStorage_(data)(state, self), - THMin(self->size, size) * sizeof(real), - cudaMemcpyDeviceToDevice, - THCState_getCurrentStream(state))); - if(self->flag & TH_STORAGE_FREEMEM) { - THCudaCheck( - (*self->allocator->free)(self->allocatorContext, THCStorage_(data)(state, self))); - } - } - - self->data_ptr = data; - self->size = size; - self->device = device; - } + THCStorage_resize(state, self, size); } THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { - return storage->device; + return THCStorage_getDevice(state, storage); } #endif diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 6867c8da57dd93..147c71491ed0de 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -15,26 +15,22 @@ ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self) int THCTensor_(nDimension)(THCState *state, const THCTensor *self) { - return self->nDimension; + return THCTensor_nDimension(state, self); } int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim) { - THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range"); - return self->size[dim]; + return THCTensor_size(state, self, dim); } int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim) { - THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range"); - return 
self->stride[dim]; + return THCTensor_stride(state, self, dim); } THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) { - THLongStorage *size = THLongStorage_newWithSize(self->nDimension); - THLongStorage_rawCopy(size, self->size); - return size; + return THCTensor_newSizeOf(state, self); } THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self) @@ -300,32 +296,12 @@ THCTensor *THCTensor_(newFoldBatchDim)(THCState *state, THCTensor *input) { /* Resize */ void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride) { - THArgCheck(size != NULL, 2, "invalid size"); - if(stride) - THArgCheck(stride->size == size->size, 3, "invalid stride"); - - THCTensor_(resizeNd)(state, self, size->size, THLongStorage_data(size), (stride ? THLongStorage_data(stride) : NULL)); + THCTensor_resize(state, self, size, stride); } void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src) { - int isSame = 0; - int d; - if(self->nDimension == src->nDimension) - { - isSame = 1; - for(d = 0; d < self->nDimension; d++) - { - if(self->size[d] != src->size[d]) - { - isSame = 0; - break; - } - } - } - - if(!isSame) - THCTensor_(resizeNd)(state, self, src->nDimension, src->size, NULL); + THCTensor_resizeAs(state, self, src); } void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, int64_t size0) @@ -359,14 +335,7 @@ void THCTensor_(resize5d)(THCState *state, THCTensor *self, int64_t size0, int64 void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src) { - if(self != src) - THCTensor_(setStorageNd)(state, - self, - src->storage, - src->storageOffset, - src->nDimension, - src->size, - src->stride); + THCTensor_set(state, self, src); } void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) @@ -570,68 +539,17 @@ void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src) void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension) { - int d; - - if(!src) - src = self; - - THArgCheck(dimension < src->nDimension, 3, "dimension out of range"); - - THCTensor_(set)(state, self, src); - - if(src->size[dimension] == 1 && src->nDimension > 1) - { - for(d = dimension; d < self->nDimension-1; d++) - { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; - } - self->nDimension--; - } + THCTensor_squeeze1d(state, self, src, dimension); } void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension) { - int d; - - if(!src) - src = self; - - THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 3, "dimension out of range"); - THArgCheck(src->nDimension > 0, 3, "cannot unsqueeze empty tensor"); - - THCTensor_(set)(state, self, src); - - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->nDimension+1)); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->nDimension+1)); - self->nDimension++; - for (d = self->nDimension-1; d > dimension; d--) { - self->size[d] = self->size[d-1]; - self->stride[d] = self->stride[d-1]; - } - if (dimension+1 < self->nDimension) { - self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; - } else { - self->stride[dimension] = 1; - } - self->size[dimension] = 1; + THCTensor_unsqueeze1d(state, self, src, dimension); } int THCTensor_(isContiguous)(THCState *state, const THCTensor *self) { - int64_t z = 1; - int d; - for(d = 
self->nDimension-1; d >= 0; d--) - { - if(self->size[d] != 1) - { - if(self->stride[d] == z) - z *= self->size[d]; - else - return 0; - } - } - return 1; + return THCTensor_isContiguous(state, self); } int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims) @@ -680,41 +598,17 @@ int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTe ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self) { - if(self->nDimension == 0) - return 0; - else - { - ptrdiff_t nElement = 1; - int d; - for(d = 0; d < self->nDimension; d++) - nElement *= self->size[d]; - return nElement; - } + return THCTensor_nElement(state, self); } void THCTensor_(retain)(THCState *state, THCTensor *self) { - if(self->flag & TH_TENSOR_REFCOUNTED) - self->refcount++; + THCTensor_retain(state, self); } void THCTensor_(free)(THCState *state, THCTensor *self) { - if(!self) - return; - - if(self->flag & TH_TENSOR_REFCOUNTED) - { - if(--self->refcount == 0) - { - THFree(self->size); - THFree(self->stride); - if(self->storage) - THCStorage_(free)(state, self->storage); - self->refcount.~atomic(); - THFree(self); - } - } + THCTensor_free(state, self); } void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst) @@ -740,95 +634,12 @@ static void THCTensor_(rawInit)(THCState *state, THCTensor *self) void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { - /* storage */ - if(self->storage != storage) - { - if(self->storage) - THCStorage_(free)(state, self->storage); - - if(storage) - { - self->storage = storage; - THCStorage_(retain)(state, self->storage); - } - else - self->storage = THCStorage_(new)(state); - } - - /* storageOffset */ - if(storageOffset < 0) - THError("Tensor: invalid storage offset"); - self->storageOffset = storageOffset; - - /* size and stride */ - THCTensor_(resizeNd)(state, self, nDimension, size, stride); + THCTensor_setStorageNd(state, self, storage, storageOffset, nDimension, size, stride); } void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int64_t *size, int64_t *stride) { - int d; - int nDimension_; - ptrdiff_t totalSize; - int hascorrectsize = 1; - - nDimension_ = 0; - for(d = 0; d < nDimension; d++) - { - if(size[d] > 0) - { - nDimension_++; - if((self->nDimension > d) && (size[d] != self->size[d])) - hascorrectsize = 0; - - if((self->nDimension > d) && stride && (stride[d] >= 0) && (stride[d] != self->stride[d])) - hascorrectsize = 0; - } - else - break; - } - nDimension = nDimension_; - - if(nDimension != self->nDimension) - hascorrectsize = 0; - - if(hascorrectsize) - return; - - if(nDimension > 0) - { - if(nDimension != self->nDimension) - { - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*nDimension); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*nDimension); - self->nDimension = nDimension; - } - - totalSize = 1; - for(d = self->nDimension-1; d >= 0; d--) - { - self->size[d] = size[d]; - if(stride && (stride[d] >= 0) ) - self->stride[d] = stride[d]; - else - { - if(d == self->nDimension-1) - self->stride[d] = 1; - else - self->stride[d] = self->size[d+1]*self->stride[d+1]; - } - totalSize += (self->size[d]-1)*self->stride[d]; - } - - if(totalSize+self->storageOffset > 0) - { - if(!self->storage) - self->storage = THCStorage_(new)(state); - if(totalSize+self->storageOffset > self->storage->size) - THCStorage_(resize)(state, self->storage, 
totalSize+self->storageOffset); - } - } - else - self->nDimension = 0; + THCTensor_resizeNd(state, self, nDimension, size, stride); } void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) diff --git a/aten/src/THC/generic/THCTensor.cu b/aten/src/THC/generic/THCTensor.cu index fbe0ed1d0d87d5..98478341575c75 100644 --- a/aten/src/THC/generic/THCTensor.cu +++ b/aten/src/THC/generic/THCTensor.cu @@ -3,8 +3,7 @@ #else THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { - if (!tensor->storage) return -1; - return THCStorage_(getDevice)(state, tensor->storage); + return THCTensor_getDevice(state, tensor); } #endif diff --git a/aten/src/THC/generic/THCTensor.hpp b/aten/src/THC/generic/THCTensor.hpp index ebffb56f47c9f6..f3f28557b17a83 100644 --- a/aten/src/THC/generic/THCTensor.hpp +++ b/aten/src/THC/generic/THCTensor.hpp @@ -2,18 +2,9 @@ #define THC_GENERIC_FILE "generic/THCTensor.hpp" #else -typedef struct THCTensor -{ - int64_t *size; - int64_t *stride; - int nDimension; - - THCStorage *storage; - ptrdiff_t storageOffset; - std::atomic refcount; - - char flag; - +// This allows one to write generic functions (i.e. functions that work across all THRealTensor types) +// without having to explicitly cast the THRealTensor. +typedef struct THCTensor : _THCTensor { } THCTensor; #endif diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 85276772c30a1d..5b40fab49f735c 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -141,9 +141,9 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); - if (TensorUtils::canUse32BitIndexMath(state, dst) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { TensorInfo dstInfo = getTensorInfo(state, dst); int dstCopyDim = dstInfo.collapseDims(dim); @@ -330,9 +330,9 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); - if (TensorUtils::canUse32BitIndexMath(state, dst) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { TensorInfo dstInfo = getTensorInfo(state, dst); int dstAddDim = dstInfo.collapseDims(dim); @@ -447,8 +447,8 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); - if (TensorUtils::canUse32BitIndexMath(state, dst) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, indices)) { TensorInfo dstInfo = getTensorInfo(state, dst); int dstFillDim = dstInfo.collapseDims(dim); @@ -579,9 +579,9 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, 
THCTensor *src, in dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); - if (TensorUtils::canUse32BitIndexMath(state, dst) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { TensorInfo dstInfo = getTensorInfo(state, dst); int dstSelectDim = dstInfo.collapseDims(dim); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index c9e267204f8a0e..37dc86cc31e1db 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -192,10 +192,10 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result, if (numInputs > 1 && !hasEmptyInput && THCTensor_(nDimension)(state, result) <= CAT_ARRAY_MAX_INPUT_DIMS && - TensorUtils::canUse32BitIndexMath(state, result) && - TensorUtils::allContiguous(state, inputs, numInputs) && - TensorUtils::all32BitIndexable(state, inputs, numInputs) && - TensorUtils::allSameDevice(state, inputs, numInputs)) { + THCTensor_canUse32BitIndexMath(state, result) && + THCTensor_allContiguous(state, (const _THCTensor **)inputs, numInputs) && + THCTensor_all32BitIndexable(state, (const _THCTensor **)inputs, numInputs) && + THCTensor_allSameDevice(state, (const _THCTensor **)inputs, numInputs)) { // First, let's set up our kernel parameters. We start with a raw pointer to the storage // for the output Tensor. diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index 79fb0c1b496cdf..be3fc3e489dbd0 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -130,7 +130,7 @@ THC_API void THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - THCTensor_(mul)(state, self_, src_, pow(2, value)); + THCTensor_(mul)(state, self_, src_, pow(2, -value)); #elif defined(THC_REAL_IS_HALF) return THError("rshift not supported for torch.CudaHalfTensor"); #else diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index 4987ad9436abc7..e60e33c7b46821 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -90,7 +90,7 @@ THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, self_, THCTensor_(nDimension)(state, src), dimension, keepdim); THLongStorage *dim = THCTensor_(newSizeOf)(state, src); THLongStorage_set(dim, dimension, 1); @@ -119,7 +119,7 @@ THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, self_, THCTensor_(nDimension)(state, src), dimension, keepdim); THLongStorage *dim = THCTensor_(newSizeOf)(state, src); THLongStorage_set(dim, dimension, 1); diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu index 5eb7b50b4d6c17..6349e07b178dfe 100644 --- a/aten/src/THC/generic/THCTensorMode.cu +++ 
b/aten/src/THC/generic/THCTensorMode.cu @@ -180,9 +180,9 @@ THC_API void THCTensor_(mode)(THCState *state, // Resize output value, index Tensors to appropriate sizes (i.e. the same as // the input Tensor, except at dim=dimension, the size is 1) - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, values, ndim, dimension, keepdim); - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, indices, ndim, dimension, keepdim); dim = THCTensor_(newSizeOf)(state, input); THLongStorage_set(dim, dimension, 1); @@ -209,7 +209,7 @@ THC_API void THCTensor_(mode)(THCState *state, // 3. Can use 32-bit index math for indexing (mainly just for implementation conciseness, could be changed) if (sliceSize <= MAX_BLOCK_SIZE && slices <= MAX_GRID_SIZE && - TensorUtils::canUse32BitIndexMath(state, input)) { + THCTensor_canUse32BitIndexMath(state, input)) { // Beginning our optimized implementation. First thing we want to do is to transpose // the input Tensor along the sort dimension, and then make it contiguous transposed = THCTensor_(newTranspose)(state, input, dimension, ndim - 1); diff --git a/aten/src/THC/generic/THCTensorScatterGather.cu b/aten/src/THC/generic/THCTensorScatterGather.cu index c026075a954cd5..58ad914c8910f2 100644 --- a/aten/src/THC/generic/THCTensorScatterGather.cu +++ b/aten/src/THC/generic/THCTensorScatterGather.cu @@ -4,7 +4,7 @@ #define RUN(TYPE, DIMS, REAL) \ THCudaTensor_gatherKernel \ - <<>>( \ + <<>>( \ tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); void THCTensor_(gather)(THCState* state, THCTensor *tensor, @@ -37,17 +37,19 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor, const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; - THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCTensor* oldTensor = NULL; - if (TensorUtils::maybeOverlappingIndices(state, tensor)) { + if (THCTensor_maybeOverlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCTensor_(newContiguous)(state, tensor); } - if (TensorUtils::canUse32BitIndexMath(state, tensor) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, index)) { + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { TensorInfo tensorInfo = getTensorInfo(state, tensor); TensorInfo srcInfo = @@ -98,7 +100,7 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor, #define RUN(TYPE, DIMS, REAL) \ THCudaTensor_scatterKernel \ - <<>>( \ + <<>>( \ tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) { @@ -130,17 +132,19 @@ void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLong const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; - THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCTensor* oldTensor = NULL; - if (TensorUtils::maybeOverlappingIndices(state, tensor)) { + if 
(THCTensor_maybeOverlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCTensor_(newContiguous)(state, tensor); } - if (TensorUtils::canUse32BitIndexMath(state, tensor) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, index)) { + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { TensorInfo tensorInfo = getTensorInfo(state, tensor); TensorInfo srcInfo = @@ -186,7 +190,7 @@ void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLong #define RUN(TYPE, DIMS, REAL) \ THCudaTensor_scatterAddKernel \ - <<>>( \ + <<>>( \ tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) { @@ -217,17 +221,19 @@ void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaL const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; - THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCTensor* oldTensor = NULL; - if (TensorUtils::maybeOverlappingIndices(state, tensor)) { + if (THCTensor_maybeOverlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCTensor_(newContiguous)(state, tensor); } - if (TensorUtils::canUse32BitIndexMath(state, tensor) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, index)) { + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { TensorInfo tensorInfo = getTensorInfo(state, tensor); TensorInfo srcInfo = @@ -273,7 +279,7 @@ void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaL #define RUN(TYPE, DIMS, REAL) \ THCudaTensor_scatterFillKernel \ - <<>>( \ + <<>>( \ tensorInfo, indexInfo, value, dim, (TYPE)totalElements); void @@ -302,16 +308,18 @@ THCTensor_(scatterFill)(THCState* state, THCTensor *tensor, const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; - THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCTensor* oldTensor = NULL; - if (TensorUtils::maybeOverlappingIndices(state, tensor)) { + if (THCTensor_maybeOverlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCTensor_(newContiguous)(state, tensor); } - if (TensorUtils::canUse32BitIndexMath(state, tensor) && - TensorUtils::canUse32BitIndexMath(state, index)) { + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, index)) { TensorInfo tensorInfo = getTensorInfo(state, tensor); TensorInfo indexInfo = diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu index 8d4e4720eebfe2..47dde9d7f5d41b 100644 --- a/aten/src/THC/generic/THCTensorSort.cu +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -109,7 +109,7 @@ THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, // The constructed key/value tensor info is used to select the slice // we are sorting on a per-block basis - if 
(TensorUtils::canUse32BitIndexMath(state, key)) { + if (THCTensor_canUse32BitIndexMath(state, key)) { TensorInfo keyInfo = getTensorInfo(state, key); keyInfo.reduceDim(dim); @@ -153,11 +153,11 @@ THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, THCudaCheck(cudaGetLastError()); } -void sortViaThrust(THCState* state, - THCTensor* sorted, - THCudaLongTensor* indices, - THCTensor* input, - int dim, bool dir) { +void THCTensor_(sortViaThrust)(THCState* state, + THCTensor* sorted, + THCudaLongTensor* indices, + THCTensor* input, + int dim, bool dir) { int nDims = THCTensor_(nDimension)(state, input); ptrdiff_t totalElements = THCTensor_(nElement)(state, input); @@ -327,7 +327,7 @@ THC_API void THCTensor_(sort)(THCState* state, } else { // Otherwise, fall back upon Thrust, which handles all other cases // (potentially slowly, with extra copies/memory allocations) - sortViaThrust(state, sorted, indices, input, dim, (bool) order); + THCTensor_(sortViaThrust)(state, sorted, indices, input, dim, (bool) order); } THCudaCheck(cudaGetLastError()); diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index f10dac01f451db..1d280df04ae1bb 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -110,9 +110,9 @@ THC_API void THCTensor_(topk)(THCState* state, // Based on required index size, run the algorithm with the // appropriate index type - if (TensorUtils::canUse32BitIndexMath(state, input) && - TensorUtils::canUse32BitIndexMath(state, topK) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, input) && + THCTensor_canUse32BitIndexMath(state, topK) && + THCTensor_canUse32BitIndexMath(state, indices)) { RUN_T(uint32_t); } else { RUN_T(uint64_t); diff --git a/aten/src/THCS/generic/THCSTensorMath.cu b/aten/src/THCS/generic/THCSTensorMath.cu index 02fbe9e09a365d..ac77b5f198de3e 100644 --- a/aten/src/THCS/generic/THCSTensorMath.cu +++ b/aten/src/THCS/generic/THCSTensorMath.cu @@ -260,19 +260,22 @@ void THCSTensor_(spcadd)(THCState *state, THCTensor *r_, THCTensor *dense, real // TODO benchmark to decide whether to remove this special case const dim3 block = getApplyBlock(); dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); if (sparse->nDimensionV == 0) { - THArgCheck(getApplyGrid(state, nnz, grid), 1, CUTORCH_DIM_WARNING); + + THArgCheck(getApplyGrid(state, nnz, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCSTensor_sparseElementwiseKernelScalar, uint64_t, real> - <<>>( + <<>>( TensorCAddOp(value), V_INFO(r_), I_INFO(indices), V_INFO(values), (uint64_t) nnz); } else { - THArgCheck(getApplyGrid(state, nnz * block.x, grid), 1, CUTORCH_DIM_WARNING); + THArgCheck(getApplyGrid(state, nnz * block.x, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCSTensor_sparseElementwiseKernel, uint64_t, real> - <<>>( + <<>>( TensorCAddOp(value), V_INFO(r_), I_INFO(indices), V_INFO(values), (uint64_t) nnz); @@ -474,10 +477,12 @@ void THCSTensor_(cmul)(THCState *state, THCSTensor *r_, THCSTensor *t_, THCSTens int64_t valueSize = t_values_->stride[0]; const dim3 block = dim3(min((int64_t) getApplyBlock().x, valueSize)); dim3 grid; - THArgCheck(getApplyGrid(state, valueSize, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, valueSize, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCSTensor_valueSparseIntersectionKernel, uint64_t, real> - <<>>( + <<>>( TensorMulOp(), I_INFO(r_indices_), I_INFO(t_indices_), 
I_INFO(s_indices_), V_INFO(r_values_), V_INFO(t_values_), V_INFO(s_values_), @@ -486,7 +491,7 @@ void THCSTensor_(cmul)(THCState *state, THCSTensor *r_, THCSTensor *t_, THCSTens THCudaLongStorage *resultNnz = THCudaLongStorage_newWithSize(state, 1); THCSTensor_indexSparseIntersectionKernel - <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + <<<1, 1, 0, THCState_getCurrentStreamOnDevice(state, curDevice)>>>( I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), (uint64_t)t_nnz, (uint64_t)s_nnz, (uint64_t*)THCudaLongStorage_data(state, resultNnz)); THCudaCheck(cudaGetLastError()); diff --git a/aten/src/THCUNN/AbsCriterion.cu b/aten/src/THCUNN/AbsCriterion.cu index b0874d78509d12..cb0f47510bc559 100644 --- a/aten/src/THCUNN/AbsCriterion.cu +++ b/aten/src/THCUNN/AbsCriterion.cu @@ -2,6 +2,7 @@ #include "common.h" #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/BCECriterion.cu b/aten/src/THCUNN/BCECriterion.cu index 066f562e4a3768..dfef9082de9498 100644 --- a/aten/src/THCUNN/BCECriterion.cu +++ b/aten/src/THCUNN/BCECriterion.cu @@ -2,12 +2,15 @@ #include "common.h" #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCApply.cuh" #include #include #include #include #include +#include template inline __host__ __device__ T eps(); diff --git a/aten/src/THCUNN/DistKLDivCriterion.cu b/aten/src/THCUNN/DistKLDivCriterion.cu index 3c0b0bc86c5092..e4e85b71045f8e 100644 --- a/aten/src/THCUNN/DistKLDivCriterion.cu +++ b/aten/src/THCUNN/DistKLDivCriterion.cu @@ -2,6 +2,7 @@ #include "common.h" #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/FusedRNNKernel.cu b/aten/src/THCUNN/FusedRNNKernel.cu index 58e22f8cfb08a0..d8b594ab046d04 100644 --- a/aten/src/THCUNN/FusedRNNKernel.cu +++ b/aten/src/THCUNN/FusedRNNKernel.cu @@ -23,7 +23,7 @@ struct TensorSigmoidOp { __device__ __forceinline__ void operator()(half* out, half* in) const { #ifdef CUDA_HALF_INSTRUCTIONS half one = ScalarConvert::to(1); - *out = hdiv(one, __hadd(one, hexp(__hneg(*in)))); + *out = __hdiv(one, __hadd(one, hexp(__hneg(*in)))); #else float fin = ScalarConvert::to(*in); *out = ScalarConvert::to(1.0f / (1.0f + expf(- fin))); @@ -33,7 +33,7 @@ struct TensorSigmoidOp { __device__ __forceinline__ void operator()(half* v) const { #ifdef CUDA_HALF_INSTRUCTIONS half one = ScalarConvert::to(1); - *v = hdiv(one, __hadd(one, hexp(__hneg(*v)))); + *v = __hdiv(one, __hadd(one, hexp(__hneg(*v)))); #else float fv = ScalarConvert::to(*v); *v = ScalarConvert::to(1.0f / (1.0f + expf(- fv))); diff --git a/aten/src/THCUNN/MSECriterion.cu b/aten/src/THCUNN/MSECriterion.cu index dd26e88bff5aef..e9571fe06c4e30 100644 --- a/aten/src/THCUNN/MSECriterion.cu +++ b/aten/src/THCUNN/MSECriterion.cu @@ -3,6 +3,7 @@ #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/SmoothL1Criterion.cu b/aten/src/THCUNN/SmoothL1Criterion.cu index 8e35599ba1dadf..c8018d997365dc 100644 --- a/aten/src/THCUNN/SmoothL1Criterion.cu +++ b/aten/src/THCUNN/SmoothL1Criterion.cu @@ -3,6 +3,7 @@ #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/SoftMarginCriterion.cu b/aten/src/THCUNN/SoftMarginCriterion.cu index e38f80c0a77204..ee53e76dca2625 100644 --- 
a/aten/src/THCUNN/SoftMarginCriterion.cu +++ b/aten/src/THCUNN/SoftMarginCriterion.cu @@ -2,6 +2,7 @@ #include "common.h" #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/generic/AbsCriterion.cu b/aten/src/THCUNN/generic/AbsCriterion.cu index ba6bb55694d730..ab8904e0ca0e9c 100644 --- a/aten/src/THCUNN/generic/AbsCriterion.cu +++ b/aten/src/THCUNN/generic/AbsCriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/AbsCriterion.cu" #else -#include "THCApply.cuh" - void THNN_(AbsCriterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/BCECriterion.cu b/aten/src/THCUNN/generic/BCECriterion.cu index 777700498cfdee..0fe9efdd21b64f 100644 --- a/aten/src/THCUNN/generic/BCECriterion.cu +++ b/aten/src/THCUNN/generic/BCECriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/BCECriterion.cu" #else -#include "THCApply.cuh" - void THNN_(BCECriterion_updateOutput)( THCState *state, THCTensor *input, @@ -32,7 +30,7 @@ void THNN_(BCECriterion_updateOutput)( input = THCTensor_(newContiguous)(state, input); target = THCTensor_(newContiguous)(state, target); - + THCThrustAllocator thrustAlloc(state); thrust::device_ptr input_data(THCTensor_(data)(state, input)); thrust::device_ptr target_data(THCTensor_(data)(state, target)); @@ -41,6 +39,7 @@ void THNN_(BCECriterion_updateOutput)( weights = THCTensor_(newContiguous)(state, weights); thrust::device_ptr weights_data(THCTensor_(data)(state, weights)); sum = thrust::transform_reduce( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), bce_functor_weights(), @@ -50,6 +49,7 @@ void THNN_(BCECriterion_updateOutput)( THCTensor_(free)(state, weights); } else { sum = thrust::transform_reduce( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), bce_functor(), diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu index eb2dc8405e9b39..eb3535898ae02c 100644 --- a/aten/src/THCUNN/generic/BatchNormalization.cu +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -6,18 +6,18 @@ #define DeviceTensor1 THCDeviceTensor template -static THCDeviceTensor devicetensor(THCState *state, THCTensor *t) { +static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor *t) { if (!t) { return THCDeviceTensor(); } - int inDim = THCTensor_(nDimension)(state, t); + int inDim = THCTensor_nDimension(state, t); if (inDim == Dim) { return toDeviceTensor(state, t); } // View in which the last dimensions are collapsed or expanded as needed - THAssert(THCTensor_(isContiguous)(state, t)); + THAssert(THCTensor_isContiguous(state, t)); int size[Dim]; for (int i = 0; i < Dim || i < inDim; ++i) { if (i < Dim && i < inDim) { @@ -43,14 +43,14 @@ void THNN_(BatchNormalization_updateOutput)( THCTensor_(resize1d)(state, saveMean_, nInput); THCTensor_(resize1d)(state, saveStd_, nInput); } - DeviceTensor3 input = devicetensor<3>(state, input_); - DeviceTensor3 output = devicetensor<3>(state, output_); - DeviceTensor1 weight = devicetensor<1>(state, weight_); - DeviceTensor1 bias = devicetensor<1>(state, bias_); - DeviceTensor1 runningMean = 
devicetensor<1>(state, runningMean_); - DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); - DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); - DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); + DeviceTensor3 input = THNN_(devicetensor)<3>(state, input_); + DeviceTensor3 output = THNN_(devicetensor)<3>(state, output_); + DeviceTensor1 weight = THNN_(devicetensor)<1>(state, weight_); + DeviceTensor1 bias = THNN_(devicetensor)<1>(state, bias_); + DeviceTensor1 runningMean = THNN_(devicetensor)<1>(state, runningMean_); + DeviceTensor1 runningVar = THNN_(devicetensor)<1>(state, runningVar_); + DeviceTensor1 saveMean = THNN_(devicetensor)<1>(state, saveMean_); + DeviceTensor1 saveStd = THNN_(devicetensor)<1>(state, saveStd_); cudaStream_t s = THCState_getCurrentStream(state); cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); @@ -81,16 +81,16 @@ void THNN_(BatchNormalization_backward)( THCTensor_(resizeAs)(state, gradInput_, input_); } - DeviceTensor3 input = devicetensor<3>(state, input_); - DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_); - DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_); - DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_); - DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_); - DeviceTensor1 weight = devicetensor<1>(state, weight_); - DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); - DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); - DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); - DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); + DeviceTensor3 input = THNN_(devicetensor)<3>(state, input_); + DeviceTensor3 gradOutput = THNN_(devicetensor)<3>(state, gradOutput_); + DeviceTensor3 gradInput = THNN_(devicetensor)<3>(state, gradInput_); + DeviceTensor1 gradWeight = THNN_(devicetensor)<1>(state, gradWeight_); + DeviceTensor1 gradBias = THNN_(devicetensor)<1>(state, gradBias_); + DeviceTensor1 weight = THNN_(devicetensor)<1>(state, weight_); + DeviceTensor1 runningMean = THNN_(devicetensor)<1>(state, runningMean_); + DeviceTensor1 runningVar = THNN_(devicetensor)<1>(state, runningVar_); + DeviceTensor1 saveMean = THNN_(devicetensor)<1>(state, saveMean_); + DeviceTensor1 saveStd = THNN_(devicetensor)<1>(state, saveStd_); cudaStream_t s = THCState_getCurrentStream(state); diff --git a/aten/src/THCUNN/generic/DistKLDivCriterion.cu b/aten/src/THCUNN/generic/DistKLDivCriterion.cu index 6ea5d3944a7c4d..b6a282527900ef 100644 --- a/aten/src/THCUNN/generic/DistKLDivCriterion.cu +++ b/aten/src/THCUNN/generic/DistKLDivCriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu" #else -#include "THCApply.cuh" - void THNN_(DistKLDivCriterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/FeatureLPPooling.cu b/aten/src/THCUNN/generic/FeatureLPPooling.cu index 31cf82582b3e17..06a9d00dff5b4c 100644 --- a/aten/src/THCUNN/generic/FeatureLPPooling.cu +++ b/aten/src/THCUNN/generic/FeatureLPPooling.cu @@ -159,7 +159,7 @@ void THNN_(FeatureLPPooling_updateOutput)(THCState* state, "input must be 1-3 dimensions for non-batch mode"); } - THArgCheck(TensorUtils::canUse32BitIndexMath(state, inputTH), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, inputTH), 2, "input tensor must fit into 32-bit index math"); THCDeviceTensor input; @@ -201,9 +201,9 @@ void THNN_(FeatureLPPooling_updateGradInput)(THCState* state, int width, int stride, bool batchMode) { - 
THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutputTH), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutputTH), 2, "output gradient tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, inputTH), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, inputTH), 3, "input tensor must fit into 32-bit index math"); THCUNN_assertSameGPU(state, 4, gradOutputTH, inputTH, outputTH, gradInputTH); diff --git a/aten/src/THCUNN/generic/FusedRNNKernel.cu b/aten/src/THCUNN/generic/FusedRNNKernel.cu index b96ac829ea6769..e7b4fec0d557c7 100644 --- a/aten/src/THCUNN/generic/FusedRNNKernel.cu +++ b/aten/src/THCUNN/generic/FusedRNNKernel.cu @@ -18,10 +18,10 @@ void THNN_(FusedRNNAssertSizes)(THCState *state, int factor, int count, ...) THCTensor_(nElement)(state, hidden), 3, "Input and Hidden tensor sizes should be the same."); - THAssertMsg(TensorUtils::getDims(state, input) <= MAX_CUTORCH_DIMS, + THAssertMsg(THCTensor_nDimension(state, input) <= MAX_CUTORCH_DIMS, "Tensor dimension is too large."); - THAssertMsg(TensorUtils::getDims(state, hidden) <= MAX_CUTORCH_DIMS, + THAssertMsg(THCTensor_nDimension(state, hidden) <= MAX_CUTORCH_DIMS, "Tensor dimension is too large."); for (int arg=2; arg < count; ++arg){ @@ -29,7 +29,7 @@ void THNN_(FusedRNNAssertSizes)(THCState *state, int factor, int count, ...) THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, tens)*factor, 3, "A pointwise tensor was not the right size, should have 1/%u the elements of input/hidden tensor.", arg, factor); - THAssertMsg(TensorUtils::getDims(state, tens) <= MAX_CUTORCH_DIMS, + THAssertMsg(THCTensor_nDimension(state, tens) <= MAX_CUTORCH_DIMS, "Tensor dimension is too large."); } @@ -42,13 +42,13 @@ int THNN_(minIndexType)(THCState *state, int count, ...) va_start(list, count); THCTensor* tens = va_arg(list, THCTensor*); - int startDim = TensorUtils::getDims(state, tens); + int startDim = THCTensor_nDimension(state, tens); bool canCollapse = THCTensor_(isContiguous)(state,tens); for (int arg=1; arg < count; ++arg){ tens = va_arg(list, THCTensor*); canCollapse = canCollapse && THCTensor_(isContiguous)(state, tens); - if(TensorUtils::getDims(state, tens) != startDim){ + if(THCTensor_nDimension(state, tens) != startDim){ va_end(list); return -1; } @@ -65,7 +65,7 @@ bool THNN_(canUse32BitIndexMath)(THCState *state, int count, ...) 
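The renames running through these THC/THCUNN hunks replace TensorUtils::canUse32BitIndexMath with the non-generic THCTensor_canUse32BitIndexMath; the check decides whether a kernel may do its offset arithmetic with 32-bit indices instead of 64-bit ones. A minimal standalone sketch of the idea behind such a check, in C++ (hypothetical helper name and simplified logic, not the actual THC implementation):

#include <cstddef>
#include <cstdint>

// Returns true when the farthest element offset reachable from these
// sizes/strides still fits in a 32-bit index, so a kernel may index with
// uint32_t instead of uint64_t.
bool canUse32BitIndexMathSketch(const int64_t* size,
                                const int64_t* stride,
                                int nDim) {
  std::ptrdiff_t maxOffset = 0;
  for (int d = 0; d < nDim; ++d) {
    if (size[d] == 0) {
      return true;  // empty tensor: no element is ever addressed
    }
    maxOffset += (size[d] - 1) * stride[d];  // farthest element along dim d
  }
  return maxOffset <= INT32_MAX;
}

Kernels templated on the index type can then be instantiated for the cheap 32-bit case and fall back to 64-bit otherwise, which is the RUN_T(uint32_t) / RUN_T(uint64_t) split visible in the THCTensorTopK.cu hunk earlier in this diff.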
for (int arg=0; arg < count; ++arg){ THCTensor *tens = va_arg(list, THCTensor*); - if (!TensorUtils::canUse32BitIndexMath(state, tens)){ + if (!THCTensor_canUse32BitIndexMath(state, tens)){ va_end(list); return false; } @@ -385,26 +385,26 @@ __global__ void #define LSTM_FORWARD(ITYPE, DIM) THNN_(LSTMForward) \ \ - <<>> \ + <<>> \ (inputI, hiddenI, \ bias1I, bias2I, cxI, hyI, cyI, \ hid_size, totalElements); #define LSTM_BACKWARD(ITYPE, DIM) THNN_(LSTMBackward) \ \ - <<>> \ + <<>> \ (storageI, gradingatesI, cxI, cyI, \ gradoutI, gradoutcI, gradincxI, \ hid_size, totalElements); #define GRU_FORWARD(ITYPE, DIM) THNN_(GRUForward) \ - <<>> \ + <<>> \ (inputI, hiddenI, bias1I, bias2I, hxI, hyI, storageI, \ hid_size, totalElements); #define GRU_BACKWARD(ITYPE, DIM) THNN_(GRUBackward) \ \ - <<>> \ + <<>> \ (gradininputI, gradinhiddenI, gradoutI, gradinhxI, storageI, \ hid_size, totalElements); @@ -434,11 +434,13 @@ void THNN_(LSTM_forw_ind_wrap)( (state, 5, input, hidden, hy, cy, cx); } - ptrdiff_t totalElements = TensorUtils::getNumElements(state, cx); + ptrdiff_t totalElements = THCTensor_nElement(state, cx); const dim3 block = getApplyBlock(); dim3 grid; - THAssertMsg(getApplyGrid(state, totalElements, grid), + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), "Could not get grid size for pointwise apply."); TINFO inputI = getTensorInfo(state, input); @@ -527,11 +529,13 @@ void THNN_(LSTM_back_ind_wrap)( int maxDim = THNN_(minIndexType) (state, 7, storage, gradInGates, cx, cy, gradOutput, gradOutputCell, gradInputCx); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, gradOutput); + ptrdiff_t totalElements = THCTensor_nElement(state, gradOutput); const dim3 block = getApplyBlock(); dim3 grid; - THAssertMsg(getApplyGrid(state, totalElements, grid), + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), "Could not get grid size for pointwise apply"); TINFO storageI = getTensorInfo(state, storage); @@ -616,11 +620,13 @@ void THNN_(GRU_forw_ind_wrap)( (state, 5, input, hidden, hx, hy, storage); } - ptrdiff_t totalElements = TensorUtils::getNumElements(state, hx); + ptrdiff_t totalElements = THCTensor_nElement(state, hx); const dim3 block = getApplyBlock(); dim3 grid; - THAssertMsg(getApplyGrid(state, totalElements, grid), + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), "Could not get grid size for pointwise apply."); TINFO inputI = getTensorInfo(state, input); @@ -713,11 +719,13 @@ void THNN_(GRU_back_ind_wrap)( int maxDim = THNN_(minIndexType)(state, 5, gradInInput, gradInHidden, gradOutput, gradInputHx, storage); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, gradOutput); + ptrdiff_t totalElements = THCTensor_nElement(state, gradOutput); const dim3 block = getApplyBlock(); dim3 grid; - THAssertMsg(getApplyGrid(state, totalElements, grid), + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), "Could not get grid size for pointwise apply"); TINFO gradininputI = getTensorInfo(state, gradInInput); diff --git a/aten/src/THCUNN/generic/IndexLinear.cu b/aten/src/THCUNN/generic/IndexLinear.cu index ec91d3f5f20881..f65c21dfe199e9 100644 --- a/aten/src/THCUNN/generic/IndexLinear.cu +++ b/aten/src/THCUNN/generic/IndexLinear.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/IndexLinear.cu" #else -static bool 
THCUNN_checkKeysValues(THCState *state, THCudaLongTensor* keys, +static bool THNN_(checkKeysValues)(THCState *state, THCudaLongTensor* keys, THCTensor* values) { return THCudaLongTensor_size(state, keys, 0) == THCTensor_(nElement)(state, values) @@ -38,7 +38,7 @@ void THNN_(IndexLinear_updateOutput)( "weight matrix must be contiguous"); THArgCheck(THCTensor_(isContiguous)(state, bias), 8, "bias vector must be contiguous"); - THArgCheck(THCUNN_checkKeysValues(state, keys, values), 1, + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); int64_t batchSize = sizes->size[0]; @@ -127,7 +127,7 @@ void THNN_(IndexLinear_accGradParameters)( "bias vector must be contiguous"); THArgCheck(THCTensor_(isContiguous)(state, valuesBuffer), 11, "valuesBuffer vector must be contiguous"); - THArgCheck(THCUNN_checkKeysValues(state, keys, values), 1, + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); THCTensor_(resize2d)(state, gradWeight, keysSize, outDim * (maxNormalize > 0 ? 2 : 1)); @@ -179,7 +179,7 @@ void THNN_(IndexLinear_accUpdateGradParameters)( "weight matrix must be contiguous"); THArgCheck(THCTensor_(isContiguous)(state, bias), 8, "bias vector must be contiguous"); - THArgCheck(THCUNN_checkKeysValues(state, keys, values), 1, + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); int64_t batchSize = sizes->size[0]; diff --git a/aten/src/THCUNN/generic/LookupTable.cu b/aten/src/THCUNN/generic/LookupTable.cu index 38179fa848ae70..b13215209af5fc 100644 --- a/aten/src/THCUNN/generic/LookupTable.cu +++ b/aten/src/THCUNN/generic/LookupTable.cu @@ -184,7 +184,7 @@ void THNN_(LookupTable_renorm)( // At launch time figure out what the index type is and norm type int Norm = ScalarConvert::to(normType); - if (TensorUtils::canUse32BitIndexMath(state, idx)) { + if (THCTensor_canUse32BitIndexMath(state, idx)) { if (Norm == 1) { RUN(1, unsigned int); } else if (Norm == 2) { diff --git a/aten/src/THCUNN/generic/MSECriterion.cu b/aten/src/THCUNN/generic/MSECriterion.cu index bd23f9d512350f..99e29be0502e15 100644 --- a/aten/src/THCUNN/generic/MSECriterion.cu +++ b/aten/src/THCUNN/generic/MSECriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/MSECriterion.cu" #else -#include "THCApply.cuh" - void THNN_(MSECriterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/SmoothL1Criterion.cu b/aten/src/THCUNN/generic/SmoothL1Criterion.cu index 765ddbb71df8e4..c338f173790dd3 100644 --- a/aten/src/THCUNN/generic/SmoothL1Criterion.cu +++ b/aten/src/THCUNN/generic/SmoothL1Criterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu" #else -#include "THCApply.cuh" - void THNN_(SmoothL1Criterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/SoftMarginCriterion.cu b/aten/src/THCUNN/generic/SoftMarginCriterion.cu index 356def09778c0b..a68a9c691879ca 100644 --- a/aten/src/THCUNN/generic/SoftMarginCriterion.cu +++ b/aten/src/THCUNN/generic/SoftMarginCriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu" #else -#include "THCApply.cuh" - void THNN_(SoftMarginCriterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index 3d143697eaf847..4e8844b481ebc9 100644 --- 
a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -2,22 +2,22 @@ #define THC_GENERIC_FILE "generic/SparseLinear.cu" #else -static bool checkInput(THCTensor* t) +static bool THNN_(checkInput)(THCTensor* t) { return t->nDimension == 2 && t->size[1] == 3; } -static bool checkSize2D(THCTensor* t, int64_t size0, int64_t size1) +static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1; } -static bool checkSize1D(THCTensor* t, int64_t size0) +static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { return t->nDimension == 1 && t->size[0] == size0; } -static inline void copyCudaFloatingType(THCState *state, THCudaIntTensor *buf, THCTensor *t) { +static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { #ifdef THC_REAL_IS_FLOAT THCudaIntTensor_copyCudaFloat(state, buf, t); #elif defined(THC_REAL_IS_DOUBLE) @@ -40,9 +40,9 @@ void THNN_(SparseLinear_updateOutput)( int64_t outDim = THCTensor_(size)(state, weight, 0); int64_t inDim = THCTensor_(size)(state, weight, 1); - THArgCheck(checkInput(input), 2, "input size must be nnz x 3"); + THArgCheck(THNN_(checkInput)(input), 2, "input size must be nnz x 3"); THArgCheck(THCTensor_(nDimension)(state, output) == 2, 3, "output must be batchsize x outputsize"); - THArgCheck(checkSize1D(bias, outDim), 5, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); weight = THCTensor_(newContiguous)(state, weight); @@ -66,9 +66,9 @@ void THNN_(SparseLinear_updateOutput)( // If rows might get out of order in future implementations, or if cusparse // complains with an illegal memory access, sort like we do in AccGradParameters THCTensor_(select)(state, sel, input, 1, 0); - copyCudaFloatingType(state, rowbuf, sel); + THNN_(copyCudaFloatingType)(state, rowbuf, sel); THCTensor_(select)(state, sel, input, 1, 1); - copyCudaFloatingType(state, colInds, sel); + THNN_(copyCudaFloatingType)(state, colInds, sel); THCTensor_(select)(state, sel, input, 1, 2); THCTensor_(copyCuda)(state, values, sel); @@ -136,9 +136,9 @@ void THNN_(SparseLinear_accGradParameters)( int64_t outDim = THCTensor_(size)(state, weight, 0); int64_t inDim = THCTensor_(size)(state, weight, 1); - THArgCheck(checkInput(input), 2, "input size must be batchsize x nnz x 2"); - THArgCheck(checkSize2D(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); - THArgCheck(checkSize1D(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); weight = THCTensor_(newContiguous)(state, weight); int64_t nnz = THCTensor_(size)(state, input, 0); @@ -166,9 +166,9 @@ void THNN_(SparseLinear_accGradParameters)( // Get data ready for cusparse, need CudaInt buffers THCTensor_(select)(state, sel, buf, 1, 0); - copyCudaFloatingType(state, rowInds, sel); + THNN_(copyCudaFloatingType)(state, rowInds, sel); THCTensor_(select)(state, sel, buf, 1, 1); - copyCudaFloatingType(state, colbuf, sel); + THNN_(copyCudaFloatingType)(state, colbuf, sel); THCTensor_(select)(state, sel, buf, 1, 2); THCTensor_(copyCuda)(state, values, sel); diff --git a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu index 61a1c7046dd25e..117fe9a8109c42 100644 --- 
a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu +++ b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu" #else -void LRNforward(THCState* state, THCTensor* input, THCTensor* output, +void THNN_(LRNforward)(THCState* state, THCTensor* input, THCTensor* output, THCTensor* scale, int local_size, accreal alpha_, accreal beta_, accreal k_) { real alpha = ScalarConvert::to(alpha_); @@ -47,7 +47,7 @@ void LRNforward(THCState* state, THCTensor* input, THCTensor* output, } -void LRNbackward(THCState* state, THCTensor* input, THCTensor* output, +void THNN_(LRNbackward)(THCState* state, THCTensor* input, THCTensor* output, THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale, int local_size, accreal alpha_, accreal beta_, accreal k_) { @@ -101,7 +101,7 @@ void THNN_(SpatialCrossMapLRN_updateOutput)( accreal beta, accreal k) { - LRNforward(state, input, output, scale, size, alpha, beta, k); + THNN_(LRNforward)(state, input, output, scale, size, alpha, beta, k); } void THNN_(SpatialCrossMapLRN_updateGradInput)( @@ -116,7 +116,7 @@ void THNN_(SpatialCrossMapLRN_updateGradInput)( accreal beta, accreal k) { - LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); + THNN_(LRNbackward)(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); } #endif diff --git a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu index 734e23a1bc2287..dc9bc99d0db203 100644 --- a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu +++ b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu @@ -7,7 +7,7 @@ void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state, THCTensor *output, int padL, int padR, int padT, int padB) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int planeDim = 0; @@ -82,9 +82,9 @@ void THNN_(SpatialReflectionPadding_updateGradInput)( int padL, int padR, int padT, int padB) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient tensor must fit into 32-bit index math"); int planeDim = 0; diff --git a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu index 4bc787205d4418..26150adb437d6b 100644 --- a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu +++ b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu @@ -8,7 +8,7 @@ void THNN_(SpatialReplicationPadding_updateOutput)( THCTensor *output, int padL, int padR, int padT, int padB) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int planeDim = 0; @@ -72,9 +72,9 @@ void THNN_(SpatialReplicationPadding_updateGradInput)( int padL, int padR, int padT, int padB) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient 
tensor must fit into 32-bit index math"); int planeDim = 0; diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu index 1caf7a356b544e..1dac0219d96dfb 100644 --- a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -6,7 +6,7 @@ void THNN_(TemporalReflectionPadding_updateOutput)(THCState *state, THCTensor *input, THCTensor *output, int padL, int padR) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int planeDim = 0; @@ -71,9 +71,9 @@ void THNN_(TemporalReflectionPadding_updateGradInput)( THCTensor *gradInput, int padL, int padR) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient tensor must fit into 32-bit index math"); int planeDim = 0; diff --git a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu index a6c7fd5a81b29a..6e1379b3c3f4af 100644 --- a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu @@ -7,7 +7,7 @@ void THNN_(TemporalReplicationPadding_updateOutput)( THCTensor *input, THCTensor *output, int padL, int padR) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int planeDim = 0; @@ -66,9 +66,9 @@ void THNN_(TemporalReplicationPadding_updateGradInput)( THCTensor *gradInput, int padL, int padR) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient tensor must fit into 32-bit index math"); int planeDim = 0; diff --git a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu index aecbd19078fdb6..cf5a298b3f05db 100644 --- a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu +++ b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu @@ -9,7 +9,7 @@ static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( int pleft, int pright, int ptop, int pbottom, int pfront, int pback) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int numInputDims = THCTensor_(nDimension)(state, input); @@ -40,7 +40,7 @@ static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( idepth, iheight, iwidth, odepth, oheight, owidth); if (gradOutput != NULL) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient tensor must fit into 32-bit index math"); THArgCheck(numPlanes == THCTensor_(size)(state, gradOutput, planeDim), 3, diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index c88b785b04aea4..7a70698508656e 100644 --- 
a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -179,6 +179,7 @@ void loadInput( void runNetwork( shared_ptr workspace, caffe2::NetDef& net_def, + const bool wipe_cache, const bool run_individual, const int warmup, const int iter) { @@ -196,7 +197,9 @@ void runNetwork( CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); } - caffe2::wipe_cache(); + if (wipe_cache) { + caffe2::wipe_cache(); + } LOG(INFO) << "Main runs."; CAFFE_ENFORCE( iter >= 0, @@ -206,11 +209,15 @@ void runNetwork( for (int i = 0; i < iter; ++i) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup); CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); - caffe2::wipe_cache(); + if (wipe_cache) { + caffe2::wipe_cache(); + } if (run_individual) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup); CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); - caffe2::wipe_cache(); + if (wipe_cache) { + caffe2::wipe_cache(); + } } } } diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h index fa4a2bcebbed5c..4ee32dba219194 100644 --- a/binaries/benchmark_helper.h +++ b/binaries/benchmark_helper.h @@ -87,5 +87,6 @@ void runNetwork( shared_ptr, caffe2::NetDef&, const bool, + const bool, const int, const int); diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc index b6f8ce6193c836..16c1173d0ff0ff 100644 --- a/binaries/caffe2_benchmark.cc +++ b/binaries/caffe2_benchmark.cc @@ -66,6 +66,10 @@ CAFFE2_DEFINE_bool( false, "Whether to write out output in text format for regression purpose."); CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up."); +CAFFE2_DEFINE_bool( + wipe_cache, + false, + "Whether to evict the cache before running network."); int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); @@ -103,6 +107,7 @@ int main(int argc, char** argv) { runNetwork( workspace, net_def, + caffe2::FLAGS_wipe_cache, caffe2::FLAGS_run_individual, caffe2::FLAGS_warmup, caffe2::FLAGS_iter); diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt new file mode 100644 index 00000000000000..1cb1d31d76d328 --- /dev/null +++ b/c10/CMakeLists.txt @@ -0,0 +1,50 @@ +project(c10 CXX C) + +set(LIB_SOURCES + dummy.cpp +) + +set(TEST_SOURCES + dummy_test.cpp +) + +if(MSVC) + # TODO Also add some warning options that MSVC can understand + set(WARNING_OPTIONS "") +else() + set(WARNING_OPTIONS + -Wall + -Wextra + -Wold-style-cast + -Wno-missing-braces + -Wcast-align + -Wcast-qual + -Wctor-dtor-privacy + -Wdisabled-optimization + -Wformat=2 + -Winit-self + -Wmissing-include-dirs + -Woverloaded-virtual + -Wredundant-decls + -Wshadow + -Wsign-promo + -Wstrict-overflow=5 + -fdiagnostics-show-option + -Wconversion + -Wpedantic + -Wno-gnu-zero-variadic-macro-arguments + -Wundef + -Werror + ) +endif() + +add_library(${PROJECT_NAME} OBJECT ${LIB_SOURCES}) +target_compile_options(${PROJECT_NAME} PRIVATE ${WARNING_OPTIONS}) + +if(BUILD_TEST) + add_executable(${PROJECT_NAME}_test ${TEST_SOURCES} $) + add_test(NAME ${PROJECT_NAME}_test COMMAND $) + target_compile_options(${PROJECT_NAME}_test PRIVATE ${WARNING_OPTIONS}) + target_link_libraries(${PROJECT_NAME}_test gtest_main) + install(TARGETS ${PROJECT_NAME}_test DESTINATION test) +endif() diff --git a/c10/dummy.cpp b/c10/dummy.cpp new file mode 100644 index 00000000000000..7eb029f2618e7f --- /dev/null +++ b/c10/dummy.cpp @@ -0,0 +1,3 @@ +// This is going to be replaced with actual c10 files + +#include diff --git a/c10/dummy.h b/c10/dummy.h new file mode 100644 index 
00000000000000..c4be4d6b808728 --- /dev/null +++ b/c10/dummy.h @@ -0,0 +1,3 @@ +#pragma once + +// This is going to be replaced with actual c10 files diff --git a/c10/dummy_test.cpp b/c10/dummy_test.cpp new file mode 100644 index 00000000000000..4e32bef2cd3aa5 --- /dev/null +++ b/c10/dummy_test.cpp @@ -0,0 +1,6 @@ +#include +#include + +TEST(DummyTest, dummy) { + EXPECT_TRUE(true); +} diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 3a4db6f1819479..a31c7521196c17 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -8,8 +8,6 @@ if(BUILD_ATEN) set(__caffe2_CMAKE_POSITION_INDEPENDENT_CODE ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(AT_LINK_STYLE INTERFACE) - # Disable contrib for root-level build - set(ATEN_NO_CONTRIB ON) add_subdirectory(../aten aten) set(CMAKE_POSITION_INDEPENDENT_CODE ${__caffe2_CMAKE_POSITION_INDEPENDENT_CODE}) @@ -44,6 +42,11 @@ if(BUILD_ATEN) list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS}) list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE}) + IF(USE_ROCM) + # Set the HIP Variables + set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) + set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) + ENDIF() endif() # ---[ Caffe2 build @@ -177,6 +180,7 @@ if(BUILD_CAFFE2) endif() # Compile exposed libraries. +list(APPEND Caffe2_CPU_SRCs $) add_library(caffe2 ${Caffe2_CPU_SRCS}) if (BUILD_CAFFE2) caffe2_interface_library(caffe2_protos caffe2_protos_whole) @@ -199,13 +203,12 @@ target_compile_options(caffe2 INTERFACE "-std=c++11") target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") -set_target_properties(caffe2 PROPERTIES VERSION 1 SOVERSION 1) install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) caffe2_interface_library(caffe2 caffe2_library) list(APPEND Caffe2_MAIN_LIBS caffe2_library) # ---[ CUDA library. -if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2)) +if(USE_CUDA) # A hack to deal with cuda library dependencies and modern CMake: the # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result, # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This @@ -236,7 +239,6 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2)) # Set standard properties on the target aten_set_target_props(caffe2_gpu) - set_target_properties(caffe2_gpu PROPERTIES VERSION 1 SOVERSION 1) install(TARGETS caffe2_gpu EXPORT Caffe2Targets DESTINATION lib) caffe2_interface_library(caffe2_gpu caffe2_gpu_library) @@ -244,25 +246,40 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2)) endif() # ---[ Caffe2 HIP sources. 
-if(BUILD_CAFFE2) - if(USE_ROCM) - HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS}) - set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS}) +if(USE_ROCM) + HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES}) - target_include_directories( - caffe2_hip PUBLIC ${Caffe2_HIP_INCLUDES}) - target_include_directories( - caffe2_hip INTERFACE $) + IF(BUILD_ATEN) + # Set necessary HIPCC Flags + SET(HIP_HCC_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}") + SET(Caffe2_HIP_CXX_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${Caffe2_HIP_CXX_FLAGS}") + ENDIF() - target_link_libraries(caffe2_hip PUBLIC caffe2) - target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS}) + # Since the HIP_ADD_LIBRARY is a MACRO, we need to set HIP_HCC_FLAGS prior to calling it. + # Also, Since HIP_INCLUDE_DIRECTORIES is a MACRO, we must call it before HIP_ADD_LIBRARY. + HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS}) - set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP) + set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS}) - caffe2_interface_library(caffe2_hip caffe2_hip_library) - list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library) - install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib) - endif() + target_include_directories( + caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES}) + target_include_directories( + caffe2_hip INTERFACE $) + + IF(BUILD_ATEN) + aten_set_target_props(caffe2_hip) + ENDIF() + + target_link_libraries(caffe2_hip PUBLIC caffe2) + target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS}) + + # https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine + # To avoid having to do above, keep this commented. + # set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP) + + caffe2_interface_library(caffe2_hip caffe2_hip_library) + list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library) + install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib) endif() # ---[ Check if warnings should be errors. @@ -292,7 +309,9 @@ if(BUILD_CAFFE2) target_compile_features(${test_name} PRIVATE cxx_range_for) endif() add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() endforeach() if(USE_ROCM) @@ -314,7 +333,8 @@ if(BUILD_CAFFE2) # Aten tests should only run when Caffe2 is not built set(__aten_test_dir "test/aten") endif() -if(BUILD_ATEN) +# Todo - Set up ATen tests for ROCm in an upcoming PR +if(BUILD_ATEN AND NOT USE_ROCM) foreach(test_src ${ATen_CPU_TEST_SRCS}) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index 6ee551c219480c..f56ffe893db54b 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -298,86 +298,126 @@ struct SimpleArray { constexpr int kCUDATensorMaxDims = 8; -#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(val, func, T, ...) \ +#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(val, Func, T, ...) 
\ do { \ CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \ switch (val) { \ case 1: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 2: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 3: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 4: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 5: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 6: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 7: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 8: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ default: { break; } \ } \ } while (false) -#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(val, func, T1, T2, ...) \ +#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(val, Func, T1, T2, ...) \ do { \ CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \ switch (val) { \ case 1: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 2: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 3: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 4: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 5: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 6: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 7: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 8: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ default: { break; } \ } \ } while (false) +#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(val, Func, T1, T2, T3, ...) \ + do { \ + CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \ + switch (val) { \ + case 1: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 2: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 3: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 4: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 5: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 6: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 7: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 8: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + default: { break; } \ + } \ + } while (false) + } // namespace caffe2 #endif // CAFFE2_CORE_COMMON_GPU_H_ diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 9a901d2ffe1e53..bbff0f4bec4185 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -51,7 +51,7 @@ class ThreadLocalCUDAObjects { cudaStream_t GetStream(int gpu, int stream_id) { vector& gpu_streams = cuda_streams_[gpu]; - if (gpu_streams.size() <= stream_id) { + if (gpu_streams.size() <= (unsigned)stream_id) { gpu_streams.resize(stream_id + 1, nullptr); } if (!gpu_streams[stream_id]) { @@ -65,7 +65,7 @@ class ThreadLocalCUDAObjects { cublasHandle_t GetHandle(int gpu, int stream_id) { DeviceGuard guard(gpu); vector& gpu_handles = cublas_handles_[gpu]; - if (gpu_handles.size() <= stream_id) { + if (gpu_handles.size() <= (unsigned)stream_id) { gpu_handles.resize(stream_id + 1, nullptr); } if (!gpu_handles[stream_id]) { @@ -84,7 +84,7 @@ class ThreadLocalCUDAObjects { cudnnHandle_t GetCudnnHandle(int gpu, int stream_id) { DeviceGuard guard(gpu); vector& gpu_handles = cudnn_handles_[gpu]; - if (gpu_handles.size() <= stream_id) { + if (gpu_handles.size() <= (unsigned)stream_id) { gpu_handles.resize(stream_id + 1, nullptr); } if (!gpu_handles[stream_id]) { diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc index b88afce259d058..3dd993c925a2d4 100644 --- a/caffe2/core/db.cc +++ b/caffe2/core/db.cc @@ -52,10 +52,10 @@ class MiniDBCursor : public 
Cursor { CAFFE_ENFORCE_GT(key_len_, 0); CAFFE_ENFORCE_GT(value_len_, 0); // Resize if the key and value len is larger than the current one. - if (key_len_ > key_.size()) { + if (key_len_ > (int)key_.size()) { key_.resize(key_len_); } - if (value_len_ > value_.size()) { + if (value_len_ > (int)value_.size()) { value_.resize(value_len_); } // Actually read in the contents. diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 7bcfa1a5f6306b..7c5b79df691918 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -226,7 +226,7 @@ class DBReader { *value = cursor_->value(); // In sharded mode, each read skips num_shards_ records - for (int s = 0; s < num_shards_; s++) { + for (uint32_t s = 0; s < num_shards_; s++) { cursor_->Next(); if (!cursor_->Valid()) { MoveToBeginning(); @@ -270,7 +270,7 @@ class DBReader { void MoveToBeginning() const { cursor_->SeekToFirst(); - for (auto s = 0; s < shard_id_; s++) { + for (uint32_t s = 0; s < shard_id_; s++) { cursor_->Next(); CAFFE_ENFORCE( cursor_->Valid(), "Db has less rows than shard id: ", s, shard_id_); diff --git a/caffe2/core/flags.cc b/caffe2/core/flags.cc index 2df6edc7848edd..04edccfdef576b 100644 --- a/caffe2/core/flags.cc +++ b/caffe2/core/flags.cc @@ -85,7 +85,7 @@ bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { string key; string value; - int prefix_idx = arg.find('='); + size_t prefix_idx = arg.find('='); if (prefix_idx == string::npos) { // If there is no equality char in the arg, it means that the // arg is specified in the next argument. diff --git a/caffe2/core/graph.cc b/caffe2/core/graph.cc index d7a900f802b4d0..ba5a3a136404c6 100644 --- a/caffe2/core/graph.cc +++ b/caffe2/core/graph.cc @@ -22,7 +22,7 @@ Graph::Graph(const NetDef& net) : netdef_(net) { // In python, this is known as "versions". std::unordered_map edge_parent; - for (int i = 0; i < nodes_.size(); i++) { + for (int i = 0; i < (int)nodes_.size(); i++) { for (const string& blob : node(i).op.input()) { auto it = edge_parent.find(blob); if (it != edge_parent.end()) { @@ -43,7 +43,7 @@ Graph::Graph(const NetDef& net) : netdef_(net) { // For any blob, which operator was the last to read to from it? std::unordered_map edge_child; - for (int i = nodes_.size() - 1; i >= 0; i--) { + for (int i = (int)nodes_.size() - 1; i >= 0; i--) { for (const string& blob : node(i).op.output()) { auto it = edge_child.find(blob); if (it == edge_child.end()) { @@ -76,7 +76,7 @@ const std::vector> Graph::GetSubgraphPerimeterHelper( const std::vector& match) { std::vector> edge_list; std::unordered_set match_set(match.begin(), match.end()); - for (int x = 0; x < nodes_.size(); x++) { + for (int x = 0; x < (int)nodes_.size(); x++) { if (!is_node_active(x)) { continue; } @@ -142,7 +142,7 @@ NetDef Graph::GetNetDef() { // However, giving each blob unique names via SSA will satisfy this condition. // Then, the resulting graph can be optimized with memonger. 
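// Standalone illustrative sketch of the sharded-read pattern in DBReader
// above: reader shard_id of num_shards starts on record shard_id and advances
// num_shards records per Read(), wrapping around at the end of the database.
// Loop counters are unsigned, matching the signedness fixes in this diff.
// ShardedReader is a hypothetical stand-in, not the Caffe2 class.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

class ShardedReader {
 public:
  ShardedReader(std::vector<int> records, uint32_t num_shards, uint32_t shard_id)
      : records_(std::move(records)), num_shards_(num_shards), shard_id_(shard_id) {
    MoveToBeginning();
  }

  int Read() {
    const int value = records_[pos_];
    // Skip num_shards_ records so consecutive reads stay on this shard.
    for (uint32_t s = 0; s < num_shards_; ++s) {
      ++pos_;
      if (pos_ >= records_.size()) {
        MoveToBeginning();
        break;
      }
    }
    return value;
  }

 private:
  void MoveToBeginning() {
    // Skip shard_id_ records so shard k starts on record k.
    pos_ = shard_id_;
  }

  std::vector<int> records_;
  uint32_t num_shards_;
  uint32_t shard_id_;
  std::size_t pos_ = 0;
};

int main() {
  ShardedReader reader({10, 11, 12, 13, 14, 15}, /*num_shards=*/2, /*shard_id=*/1);
  for (int i = 0; i < 4; ++i) {
    std::printf("%d ", reader.Read());  // 11 13 15 11
  }
  std::printf("\n");
}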
- for (int i = 0; i < nodes_.size(); i++) { + for (int i = 0; i < (int)nodes_.size(); i++) { unchecked_parent_count.push_back(node(i).parents.size()); if (node(i).parents.size() == 0 && is_node_active(i)) { q.push(i); diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc index 4a61ecb1060a85..121feaf30e736e 100644 --- a/caffe2/core/memonger.cc +++ b/caffe2/core/memonger.cc @@ -30,7 +30,7 @@ NetDef optimize_inference_net( // Step 1: count first and last operator for each blob std::unordered_set all_blobs; std::unordered_map> ranges; - for (int i = 0; i < ops.size(); i++) { + for (size_t i = 0; i < ops.size(); i++) { for (auto& inp : ops[i].input()) { if (ranges.find(inp) != ranges.end()) { ranges[inp].second = i; @@ -53,7 +53,7 @@ NetDef optimize_inference_net( std::unordered_map renaming; std::unordered_map mapping; - for (int i = 0; i < ops.size(); i++) { + for (int i = 0; i < (int)ops.size(); i++) { auto& op = ops[i]; std::unordered_set new_free_blobs; @@ -366,7 +366,7 @@ class ComputeBlobRecyclingForDag { for (const auto& input : current_op.input()) { if (has_key(shareable_blob_names, input)) { blob_input_count_[input]++; - if (blob_input_count_[input] == blob_to_ops_[input].size()) { + if (blob_input_count_[input] == (int)blob_to_ops_[input].size()) { const string& actual_blob = get_blob_or_mapped_blob(input); if (!has_key(dont_share_blob_names, actual_blob)) { new_free_blobs.emplace_back( @@ -456,7 +456,7 @@ class ComputeBlobRecyclingForDag { return 0; } int size = 1; - for (int i = 0; i < blob_shapes_iter->second.size(); ++i) { + for (size_t i = 0; i < blob_shapes_iter->second.size(); ++i) { size *= blob_shapes_iter->second[i]; } return size; @@ -526,7 +526,7 @@ class ComputeBlobRecyclingForDag { const int blob_size = infer_blob_size(blob_name, blob_shapes); int best_size = -1; int free_blob_index = -1; - for (int i = 0; i < free_blobs->size(); ++i) { + for (size_t i = 0; i < free_blobs->size(); ++i) { const string& cb_name = (*free_blobs)[i].second; if (can_use_blob(cb_name, tokens, device)) { const int cand_bz = blob_sizes_[cb_name]; diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index b14e55ea5a5b6e..9d6f536c063467 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -109,7 +109,7 @@ void checkExecutorOverride(std::string& net_type) { CAFFE_ENFORCE( executors.size() % 2 == 0, "Invalid override executors flag value"); std::unordered_map overrides; - for (auto idx = 0; idx < executors.size() - 1; idx += 2) { + for (size_t idx = 0; idx < executors.size() - 1; idx += 2) { overrides[executors[idx]] = executors[idx + 1]; } if (overrides.count(net_type)) { diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index 30c2371b58b783..742a3457ec9164 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -163,7 +163,7 @@ int AsyncNetBase::stream(int task_id) { if (device_option.device_type() == CUDA) { int gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); - if (gpu_id >= stream_counters_.size()) { + if ((unsigned)gpu_id >= stream_counters_.size()) { stream_counters_.resize(gpu_id + 1, 0); } do { diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc index 402b27c47141f1..12bd33ac7e247d 100644 --- a/caffe2/core/net_async_dag_gpu.cc +++ b/caffe2/core/net_async_dag_gpu.cc @@ -114,7 +114,7 @@ int AsyncDAGNet::stream(const DeviceOption& device_option) { if (device_option.device_type() == CUDA) { int gpu_id = device_option.cuda_gpu_id(); 
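// Standalone illustrative sketch of the executor-override lookup in
// checkExecutorOverride above: a flat "net_type,executor,net_type,executor"
// list is folded into a map and consulted by net type. The unsigned loop
// counter mirrors the fix in this diff; idx + 1 < size also avoids unsigned
// wrap-around on an empty list. Names are hypothetical, not the Caffe2 ones.
#include <cstddef>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

std::string ResolveExecutor(const std::vector<std::string>& flat_overrides,
                            const std::string& net_type) {
  if (flat_overrides.size() % 2 != 0) {
    return net_type;  // malformed override list; keep the requested type
  }
  std::unordered_map<std::string, std::string> overrides;
  for (std::size_t idx = 0; idx + 1 < flat_overrides.size(); idx += 2) {
    overrides[flat_overrides[idx]] = flat_overrides[idx + 1];
  }
  const auto it = overrides.find(net_type);
  return it == overrides.end() ? net_type : it->second;
}

int main() {
  const std::vector<std::string> flat = {"dag", "async_scheduling", "simple", "simple"};
  std::printf("%s\n", ResolveExecutor(flat, "dag").c_str());  // async_scheduling
}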
CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); - if (gpu_id >= stream_counters_.size()) { + if ((unsigned)gpu_id >= stream_counters_.size()) { stream_counters_.resize(gpu_id + 1, 0); } do { diff --git a/caffe2/core/net_async_tracing.cc b/caffe2/core/net_async_tracing.cc index 4ef17f68742e0c..0d3f83aba277ed 100644 --- a/caffe2/core/net_async_tracing.cc +++ b/caffe2/core/net_async_tracing.cc @@ -255,7 +255,7 @@ Tracer::~Tracer() { renameThreads(); std::stringstream serialized; serialized << "[\n"; - for (auto idx = 0; idx < events_.size(); ++idx) { + for (size_t idx = 0; idx < events_.size(); ++idx) { serialized << serializeEvent(events_[idx]); if (idx != events_.size() - 1) { serialized << ",\n"; diff --git a/caffe2/core/net_dag.cc b/caffe2/core/net_dag.cc index fca21866f10420..8ecc17f902c958 100644 --- a/caffe2/core/net_dag.cc +++ b/caffe2/core/net_dag.cc @@ -51,7 +51,7 @@ DAGNetBase::DAGNetBase( // Figure out the initial frontier - this is the one we will feed into the job // queue to start a run. - for (int idx = 0; idx < operator_nodes_.size(); ++idx) { + for (size_t idx = 0; idx < operator_nodes_.size(); ++idx) { if (operator_nodes_[idx].parents_.size() == 0) { initial_frontier_.push_back(idx); } @@ -66,7 +66,7 @@ DAGNetBase::DAGNetBase( } num_workers_ = num_workers; - for (int idx = 0; idx < operator_nodes_.size(); ++idx) { + for (size_t idx = 0; idx < operator_nodes_.size(); ++idx) { if (operator_nodes_[idx].is_chain_start_) { task_timers_[idx] = caffe2::make_unique(); } @@ -112,11 +112,11 @@ bool DAGNetBase::DoRunAsync() { job_queue_ = caffe2::make_unique>(); } // Figure out number of workers to start. - auto num_workers_to_start = num_workers_ - workers_.size(); + size_t num_workers_to_start = num_workers_ - workers_.size(); // Ensure the number of workers matches the defined in case // any of the previously started threads terminated. 
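// Standalone illustrative sketch of the scheduling idea behind
// DAGNetBase::initial_frontier_ above: operators with no parents are runnable
// immediately, and finishing an operator decrements its children's pending
// parent counts, pushing newly ready ones. Plain Kahn-style ordering with
// hypothetical names; the real net runs operator chains on worker threads.
#include <cstddef>
#include <cstdio>
#include <queue>
#include <vector>

struct Node {
  std::vector<int> parents;
  std::vector<int> children;
};

std::vector<int> TopologicalOrder(const std::vector<Node>& nodes) {
  std::vector<std::size_t> pending(nodes.size());
  std::queue<int> frontier;
  for (std::size_t idx = 0; idx < nodes.size(); ++idx) {  // unsigned, as in the fixes above
    pending[idx] = nodes[idx].parents.size();
    if (pending[idx] == 0) {
      frontier.push(static_cast<int>(idx));  // the "initial frontier"
    }
  }
  std::vector<int> order;
  while (!frontier.empty()) {
    const int i = frontier.front();
    frontier.pop();
    order.push_back(i);
    for (const int child : nodes[i].children) {
      if (--pending[child] == 0) {
        frontier.push(child);
      }
    }
  }
  return order;
}

int main() {
  // 0 -> {1, 2} -> 3
  std::vector<Node> nodes(4);
  nodes[0] = {{}, {1, 2}};
  nodes[1] = {{0}, {3}};
  nodes[2] = {{0}, {3}};
  nodes[3] = {{1, 2}, {}};
  for (const int i : TopologicalOrder(nodes)) {
    std::printf("%d ", i);  // 0 1 2 3
  }
  std::printf("\n");
}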
- for (auto i = 0; i < num_workers_to_start; i++) { + for (size_t i = 0; i < num_workers_to_start; i++) { VLOG(1) << "Start worker #" << workers_.size(); workers_.push_back(std::thread(&DAGNetBase::WorkerFunction, this)); } diff --git a/caffe2/core/net_dag_utils.cc b/caffe2/core/net_dag_utils.cc index 9cec13906dc095..6c24a3fdba06f4 100644 --- a/caffe2/core/net_dag_utils.cc +++ b/caffe2/core/net_dag_utils.cc @@ -30,7 +30,7 @@ void prune(int node_idx, std::vector& nodes) { // If the node has already been visited, pop curr out of // stack and clean up the ancestor table - CAFFE_ENFORCE(curr < ancestors.size(), "Out of bound access"); + CAFFE_ENFORCE(curr < (int)ancestors.size(), "Out of bound access"); if (ancestors[curr]) { ancestors[curr] = false; nodes_stack.pop(); @@ -93,7 +93,7 @@ std::vector pruneOpNodeGraph( pruned.push_back(nd); } - for (int i = 0; i < pruned.size(); ++i) { + for (int i = 0; i < (int)pruned.size(); ++i) { if (pruned[i].parents_.size() == 0) { prune(i, pruned); } @@ -107,7 +107,7 @@ std::vector pruneOpNodeGraph( void updateOperatorNodes( std::vector& nodes, const ExecutionChains& chains) { - for (int i = 0; i < nodes.size(); ++i) { + for (int i = 0; i < (int)nodes.size(); ++i) { auto& node = nodes[i]; if (chains.find(i) != chains.end()) { node.is_chain_start_ = true; @@ -123,7 +123,7 @@ void updateOperatorNodes( ExecutionChains computeChains(std::vector& orig_nodes) { const std::vector nodes = pruneOpNodeGraph(orig_nodes); vector initial_frontier; - for (int idx = 0; idx < nodes.size(); ++idx) { + for (int idx = 0; idx < (int)nodes.size(); ++idx) { if (nodes[idx].parents_.size() == 0) { initial_frontier.push_back(idx); } @@ -280,7 +280,7 @@ ExecutionChains computeChains(std::vector& orig_nodes) { ExecutionChains singleChains(std::vector& nodes) { ExecutionChains chains; - for (auto i = 0; i < nodes.size(); ++i) { + for (int i = 0; i < (int)nodes.size(); ++i) { chains[i] = {i}; } updateOperatorNodes(nodes, chains); @@ -362,7 +362,7 @@ std::vector prepareOperatorNodes( // Now, make sure that the parent list and the children list do not contain // duplicated items. - for (int i = 0; i < operator_nodes.size(); ++i) { + for (int i = 0; i < (int)operator_nodes.size(); ++i) { auto& node = operator_nodes[i]; // Sort, remove duplicates, and delete self dependency. 
auto& p = node.parents_; @@ -383,7 +383,7 @@ std::vector prepareChainGraphNodes( const std::vector& operator_nodes, const std::vector>& execution_chains) { std::unordered_map op_to_chain_idx; - for (int chain_idx = 0; chain_idx < execution_chains.size(); ++chain_idx) { + for (int chain_idx = 0; chain_idx < (int)execution_chains.size(); ++chain_idx) { const auto& chain_indices = execution_chains[chain_idx]; for (const auto& chain_op_idx : chain_indices) { CAFFE_ENFORCE(!op_to_chain_idx.count(chain_op_idx)); @@ -392,7 +392,7 @@ std::vector prepareChainGraphNodes( } std::vector chain_nodes(execution_chains.size()); - for (int op_idx = 0; op_idx < operator_nodes.size(); ++op_idx) { + for (int op_idx = 0; op_idx < (int)operator_nodes.size(); ++op_idx) { CAFFE_ENFORCE(op_to_chain_idx.count(op_idx)); auto chain_idx = op_to_chain_idx[op_idx]; auto& chain = chain_nodes[chain_idx]; diff --git a/caffe2/core/net_simple.cc b/caffe2/core/net_simple.cc index 43d6aec84c5dbd..46c1e8f5956d51 100644 --- a/caffe2/core/net_simple.cc +++ b/caffe2/core/net_simple.cc @@ -178,7 +178,7 @@ vector SimpleNet::TEST_Benchmark( ++idx; } } - int idx = 0; + size_t idx = 0; for (auto& op : operators_) { const string& op_type = op->debug_def().type(); const string& print_name = @@ -232,7 +232,7 @@ vector SimpleNet::TEST_Benchmark( metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type); metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type); metric_per_op_type_vec_vec.emplace_back(¶m_bytes_per_op_type); - for (int i = 0; i < metric_per_op_type_vec_vec.size(); ++i) { + for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) { std::cout << metric[i] << " per operator type:" << std::endl; auto* item = metric_per_op_type_vec_vec[i]; std::vector> metric_per_op_type_vec( @@ -260,7 +260,7 @@ vector SimpleNet::TEST_Benchmark( } } // We will reuse time_per_op to return the result of BenchmarkNet. 
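// Standalone illustrative sketch of the "sort, remove duplicates, and delete
// self dependency" step in prepareOperatorNodes above, applied to a plain
// vector of parent indices. NormalizeParents is a hypothetical helper.
#include <algorithm>
#include <cstdio>
#include <vector>

void NormalizeParents(int self, std::vector<int>* parents) {
  std::sort(parents->begin(), parents->end());
  parents->erase(std::unique(parents->begin(), parents->end()), parents->end());
  parents->erase(std::remove(parents->begin(), parents->end(), self),
                 parents->end());
}

int main() {
  std::vector<int> parents = {3, 1, 3, 2, 2};
  NormalizeParents(/*self=*/2, &parents);
  for (const int p : parents) {
    std::printf("%d ", p);  // 1 3
  }
  std::printf("\n");
}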
- for (int i = 0; i < time_per_op.size(); ++i) { + for (size_t i = 0; i < time_per_op.size(); ++i) { time_per_op[i] /= main_runs; } if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) { diff --git a/caffe2/core/numa.cc b/caffe2/core/numa.cc index 26c1284b4c4d0e..a8c2d4f22f541d 100644 --- a/caffe2/core/numa.cc +++ b/caffe2/core/numa.cc @@ -75,7 +75,7 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) { size_t page_start_ptr = (((size_t)ptr) & ~(getpagesize() - 1)); size_t offset = ((size_t)ptr) - page_start_ptr; // Avoid extra dynamic allocation and NUMA api calls - CAFFE_ENFORCE(numa_node_id >= 0 && numa_node_id < sizeof(unsigned long) * 8); + CAFFE_ENFORCE(numa_node_id >= 0 && (unsigned)numa_node_id < sizeof(unsigned long) * 8); unsigned long mask = 1UL << numa_node_id; CAFFE_ENFORCE( mbind( diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 1f54254fadadea..9a71993e826fa6 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -148,7 +148,7 @@ unique_ptr _CreateOperator( << engine; auto op = TryCreateOperator(key, operator_def, ws); if (op) { - if (engine.size() <= FLAGS_caffe2_operator_max_engine_name_length) { + if (engine.size() <= (unsigned)FLAGS_caffe2_operator_max_engine_name_length) { op->annotate_engine(engine); } else { op->annotate_engine( @@ -450,7 +450,7 @@ TensorShapes InferBlobShapesAndTypes( CaffeMap grads_to_params = GradientMakerBase::MatchGradsToParams(op); - for (int i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { if (out[i].unknown_shape()) { std::string gradout = op.output(i); @@ -479,7 +479,7 @@ TensorShapes InferBlobShapesAndTypes( return tps; } - if (out.size() != op.output_size()) { + if (out.size() != (unsigned)op.output_size()) { if (op.type() == "Slice") { CAFFE_ENFORCE( out.size() == 0, @@ -495,7 +495,7 @@ TensorShapes InferBlobShapesAndTypes( out.size()); } } else { - for (int i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { blob_desc[op.output(i)] = out[i]; } } diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index f31ac181b7aae7..9372ed49f4243a 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -189,7 +189,7 @@ class OperatorBase : public Observable { bool found_input; if (err->caller() != nullptr) { - for (int i = 0; i < inputs_.size(); i++) { + for (size_t i = 0; i < inputs_.size(); i++) { if (inputs_[i]->GetRaw() == err->caller()) { found_input = true; err->AppendMessage( @@ -197,7 +197,7 @@ class OperatorBase : public Observable { break; } } - for (int i = 0; i < outputs_.size(); i++) { + for (size_t i = 0; i < outputs_.size(); i++) { if (outputs_[i]->GetRaw() == err->caller()) { if (found_input) { err->AppendMessage("\n OR "); diff --git a/caffe2/core/operator_schema.cc b/caffe2/core/operator_schema.cc index c38c3f724ad822..678646a520a261 100644 --- a/caffe2/core/operator_schema.cc +++ b/caffe2/core/operator_schema.cc @@ -286,7 +286,7 @@ DEFINE_STANDARG_ARG(IsTest, is_test) #undef DEFINE_STANDARG_ARG OpSchema& OpSchema::Input(const int n, const char* name, const char* description) { - if (input_desc_.size() <= n) { + if (input_desc_.size() <= (unsigned)n) { input_desc_.resize(n + 1); } input_desc_[n] = std::make_pair(name, description); @@ -294,7 +294,7 @@ OpSchema& OpSchema::Input(const int n, const char* name, const char* description } OpSchema& OpSchema::Output(const int n, const char* name, const char* description) { - if (output_desc_.size() <= n) { + if (output_desc_.size() <= (unsigned)n) { output_desc_.resize(n + 1); } 
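// Standalone illustrative sketch of the single-word node mask built in
// NUMAMove above: the node id must index a bit inside one unsigned long,
// which is exactly the bound the strengthened CAFFE_ENFORCE checks
// (sizeof(unsigned long) * 8 bits). Plain asserts stand in for CAFFE_ENFORCE,
// and the actual mbind() call is omitted.
#include <cassert>
#include <cstdio>

unsigned long NodeMask(int numa_node_id) {
  assert(numa_node_id >= 0);
  assert(static_cast<unsigned>(numa_node_id) < sizeof(unsigned long) * 8);
  return 1UL << numa_node_id;
}

int main() {
  std::printf("0x%lx\n", NodeMask(3));  // 0x8
}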
output_desc_[n] = std::make_pair(name, description); @@ -328,7 +328,7 @@ std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { if (schema.max_input_ > 0) { out << "Inputs:" << std::endl; if (!schema.input_desc_.empty()) { - for (int i = 0; i < schema.input_desc_.size(); ++i) { + for (size_t i = 0; i < schema.input_desc_.size(); ++i) { const auto& p = schema.input_desc_[i]; out << " " << i << ", " << (p.first ? p.first : "(unnamed)") << " : " << (p.second ? p.second : "(no doc)") << std::endl; @@ -340,7 +340,7 @@ std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { if (schema.max_output_ > 0) { out << "Outputs:" << std::endl; if (!schema.output_desc_.empty()) { - for (int i = 0; i < schema.output_desc_.size(); ++i) { + for (size_t i = 0; i < schema.output_desc_.size(); ++i) { const auto& p = schema.output_desc_[i]; out << " " << i << ", " << (p.first ? p.first : "(unnamed)") << " : " << (p.second ? p.second : "(no doc)") << std::endl; diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 7574470f798b34..5f6ff342ed79c5 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -437,7 +437,7 @@ bool ExecuteStepRecursive(ExecutionStepWrapper& stepWrapper) { if (step.has_num_concurrent_instances()) { numThreads *= step.num_concurrent_instances(); } - for (int64_t i = 0; i < numThreads; ++i) { + for (size_t i = 0; i < numThreads; ++i) { threads.emplace_back(worker); } for (auto& thread : threads) { diff --git a/caffe2/core/predictor.cc b/caffe2/core/predictor.cc index 0462ad7aded3e0..e82aa098ce6e8e 100644 --- a/caffe2/core/predictor.cc +++ b/caffe2/core/predictor.cc @@ -122,8 +122,8 @@ Predictor::Predictor( } bool Predictor::run(const TensorVector& inputs, TensorVector* outputs) { - CAFFE_ENFORCE(inputs.size() <= run_net_.external_input_size()); - for (auto i = 0; i < inputs.size(); ++i) { + CAFFE_ENFORCE(inputs.size() <= (unsigned)run_net_.external_input_size()); + for (size_t i = 0; i < inputs.size(); ++i) { shareInputTensor(&ws_, run_net_.external_input(i), inputs[i]); } @@ -132,7 +132,7 @@ bool Predictor::run(const TensorVector& inputs, TensorVector* outputs) { } outputs->resize(run_net_.external_output_size()); - for (auto i = 0; i < outputs->size(); ++i) { + for (size_t i = 0; i < outputs->size(); ++i) { (*outputs)[i] = extractOutputTensor(&ws_, run_net_.external_output(i)); } return true; @@ -158,7 +158,7 @@ bool Predictor::run_map(const TensorMap& inputs, TensorVector* outputs) { } outputs->resize(run_net_.external_output_size()); - for (auto i = 0; i < outputs->size(); ++i) { + for (size_t i = 0; i < outputs->size(); ++i) { (*outputs)[i] = extractOutputTensor(&ws_, run_net_.external_output(i)); } return true; diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 852b605ac2aecf..1be8193400f11c 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -84,7 +84,7 @@ struct TensorCPUStatGetter : BlobStatGetter { auto nbytes = tensor.nbytes(); if (nbytes > 0 && tensor.IsType()) { const auto* data = tensor.data(); - for (size_t i = 0; i < tensor.size(); ++i) { + for (int i = 0; i < tensor.size(); ++i) { nbytes += data[i].size(); } } diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 338ffe03d70539..84c7c98e73b930 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -39,7 +39,7 @@ inline vector ToVectorTIndex(const std::vector& src) { */ inline TIndex size_from_dim_(int k, const vector& dims) { TIndex r = 1; - for (int i = k; i < dims.size(); ++i) { + for (size_t i = k; i < 
dims.size(); ++i) { r *= dims[i]; } return r; @@ -47,7 +47,7 @@ inline TIndex size_from_dim_(int k, const vector& dims) { // Product of all dims up to inline TIndex size_to_dim_(int k, const vector& dims) { - CAFFE_ENFORCE(k <= dims.size()); + CAFFE_ENFORCE((unsigned)k <= dims.size()); TIndex r = 1; for (int i = 0; i < k; ++i) { r *= dims[i]; @@ -57,7 +57,7 @@ inline TIndex size_to_dim_(int k, const vector& dims) { // Product of all dims between k and l (not including dims[k] and dims[l]) inline TIndex size_between_dim_(int k, int l, const vector& dims) { - CAFFE_ENFORCE(l < dims.size()); + CAFFE_ENFORCE((unsigned)l < dims.size()); TIndex r = 1; if (k < l) { for (int i = k + 1; i < l; ++i) { @@ -694,7 +694,7 @@ class Tensor { auto old_size = size_; dims_.resize(src.size()); TIndex new_size = 1; - for (unsigned int i = 0; i < src.size(); ++i) { + for (size_t i = 0; i < src.size(); ++i) { new_size *= src[i]; dims_[i] = src[i]; } diff --git a/caffe2/core/transform.cc b/caffe2/core/transform.cc index 51ac5d1297cec3..5ea42fe83b9b83 100644 --- a/caffe2/core/transform.cc +++ b/caffe2/core/transform.cc @@ -20,7 +20,7 @@ std::vector> Transform::PatternMatch(const Graph& graph) { std::vector> matches; // Consider every possible node as the starting point. - for (int idx = 0; idx < graph.size(); ++idx) { + for (int idx = 0; idx < (int)graph.size(); ++idx) { // The current working subgraph. We will try to add new nodes to this, // when invoking the PatternRule. std::vector subgraph; @@ -81,7 +81,7 @@ void Transform::PatternMatchHelper( best_subgraph = subgraph; } - int size_before = subgraph.size(); + size_t size_before = subgraph.size(); if (pattern_match_type_ == CONNECTED_SUBGRAPH) { // Connected Component Order Pattern Matching @@ -89,7 +89,7 @@ void Transform::PatternMatchHelper( // Try adding each parent and child of every node in the subgraph, // and see if we can accept it. - for (int i = 0; i < subgraph.size(); i++) { + for (size_t i = 0; i < subgraph.size(); i++) { int x = subgraph[i]; TryNeighbors( graph, @@ -119,11 +119,11 @@ void Transform::PatternMatchHelper( // node in our current subgraph. // Thus, we simply iterate over the nodes that come AFTER the last node of // our current subgraph. - int start_idx = 0; + size_t start_idx = 0; if (subgraph.size() > 0) { start_idx = subgraph.back() + 1; } - for (int i = start_idx; i < graph.size(); i++) { + for (size_t i = start_idx; i < graph.size(); i++) { if (!matched.at(i) && PatternRule(graph, subgraph, i)) { subgraph.push_back(i); PatternMatchHelper(graph, matched, subgraph_ptr, best_subgraph_ptr); @@ -136,7 +136,7 @@ void Transform::PatternMatchHelper( // For every current subgraph, we consider all nodes to be // the next candidate node, as long as it isn't already matched. - for (int i = 0; i < graph.size(); i++) { + for (size_t i = 0; i < graph.size(); i++) { if (std::find(subgraph.begin(), subgraph.end(), i) == subgraph.end()) { // Then we try appending it to the subgraph. 
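// Standalone illustrative sketch of the size_from_dim_ / size_to_dim_ helpers
// touched above: products over the trailing and leading dimensions of a
// shape, with unsigned loop counters and bound checks matching this diff.
// TIndex is assumed to be a 64-bit signed integer, as in Caffe2.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

using TIndex = int64_t;

TIndex SizeFromDim(int k, const std::vector<TIndex>& dims) {
  TIndex r = 1;
  for (std::size_t i = static_cast<std::size_t>(k); i < dims.size(); ++i) {
    r *= dims[i];  // product of dims[k..end)
  }
  return r;
}

TIndex SizeToDim(int k, const std::vector<TIndex>& dims) {
  assert(static_cast<std::size_t>(k) <= dims.size());
  TIndex r = 1;
  for (int i = 0; i < k; ++i) {
    r *= dims[i];  // product of dims[0..k)
  }
  return r;
}

int main() {
  const std::vector<TIndex> dims = {2, 3, 4, 5};
  std::printf("%lld %lld\n",
              static_cast<long long>(SizeFromDim(2, dims)),  // 4 * 5 = 20
              static_cast<long long>(SizeToDim(2, dims)));   // 2 * 3 = 6
}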
if (!matched.at(i) && PatternRule(graph, subgraph, i)) { diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h index 3d57baceff7cec..cad012e43211ef 100644 --- a/caffe2/core/typeid.h +++ b/caffe2/core/typeid.h @@ -219,7 +219,7 @@ class TypeMeta { template static void _Ctor(void* ptr, size_t n) { T* typed_ptr = static_cast(ptr); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { new (typed_ptr + i) T; } } @@ -231,7 +231,7 @@ class TypeMeta { static void _Copy(const void* src, void* dst, size_t n) { const T* typed_src = static_cast(src); T* typed_dst = static_cast(dst); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { typed_dst[i] = typed_src[i]; } } @@ -253,7 +253,7 @@ class TypeMeta { template static void _Dtor(void* ptr, size_t n) { T* typed_ptr = static_cast(ptr); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { typed_ptr[i].~T(); } } diff --git a/caffe2/ideep/operators/elementwise_sum_op.cc b/caffe2/ideep/operators/elementwise_sum_op.cc index eb9baf99d6990a..373d8801368b91 100644 --- a/caffe2/ideep/operators/elementwise_sum_op.cc +++ b/caffe2/ideep/operators/elementwise_sum_op.cc @@ -12,7 +12,7 @@ class IDEEPSumOp final : public IDEEPOperator { virtual ~IDEEPSumOp() {} bool RunOnDevice() override { - const auto &X = Input(INPUT0); + const auto& X = Input(INPUT0); auto* Y = Output(OUTPUT); if (InputSize() == 1) { @@ -20,8 +20,15 @@ class IDEEPSumOp final : public IDEEPOperator { } else { vector inputs; - vector scales (InputSize(), 1.0); + const vector scales(InputSize(), 1.0); + const auto dims = X.get_dims(); for (int i = 0; i < InputSize(); ++i) { + if (Input(i).get_dims() != dims) { + CAFFE_ENFORCE_EQ( + dims, + Input(i).get_dims(), + "Broadcast is not yet supported with IDEEP."); + } inputs.emplace_back(Input(i)); } @@ -32,11 +39,11 @@ class IDEEPSumOp final : public IDEEPOperator { } private: - INPUT_TAGS(INPUT0); OUTPUT_TAGS(OUTPUT); }; REGISTER_IDEEP_OPERATOR(Sum, IDEEPSumOp); +REGISTER_IDEEP_OPERATOR(Add, IDEEPSumOp); } // namespace caffe2 diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc index 216633e38d8a06..1f3ad1eb98ce41 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.cc +++ b/caffe2/ideep/operators/operator_fallback_ideep.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -53,5 +54,7 @@ REGISTER_IDEEP_OPERATOR( REGISTER_IDEEP_OPERATOR( GivenTensorFill, IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR(Load, IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR(Save, IDEEPFallbackOp>); } // namespace caffe2 diff --git a/caffe2/ideep/operators/squeeze_op.cc b/caffe2/ideep/operators/squeeze_op.cc new file mode 100644 index 00000000000000..fe78e30a7d3369 --- /dev/null +++ b/caffe2/ideep/operators/squeeze_op.cc @@ -0,0 +1,59 @@ +#include +#include "caffe2/operators/expand_squeeze_dims_op.h" + +namespace caffe2 { + +class IDEEPSqueezeOp final : public IDEEPOperator { + public: + USE_IDEEP_DEF_ALIASES(); + USE_IDEEP_OPERATOR_FUNCTIONS(); + + IDEEPSqueezeOp(const OperatorDef& operator_def, Workspace* ws) + : IDEEPOperator(operator_def, ws), + dims_(OperatorBase::GetRepeatedArgument("dims")) { + auto originalSize = dims_.size(); + CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided."); + + std::sort(dims_.begin(), dims_.end()); + dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end()); + if (dims_.size() < originalSize) { + LOG(WARNING) << "Parameter `dims` has repeated dimensions."; + } + 
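// Standalone illustrative sketch of the typed construct/copy/destroy helpers
// in TypeMeta above (_Ctor, _Copy, _Dtor): n objects of type T are placement-
// constructed, copied, and destroyed in raw storage, with unsigned loop
// counters as in this diff. The helper names are hypothetical.
#include <cstddef>
#include <cstdio>
#include <new>
#include <string>

template <typename T>
void CtorN(void* ptr, std::size_t n) {
  T* typed = static_cast<T*>(ptr);
  for (std::size_t i = 0; i < n; ++i) {
    new (typed + i) T;  // default-construct in place
  }
}

template <typename T>
void CopyN(const void* src, void* dst, std::size_t n) {
  const T* s = static_cast<const T*>(src);
  T* d = static_cast<T*>(dst);
  for (std::size_t i = 0; i < n; ++i) {
    d[i] = s[i];  // both ranges must already hold constructed objects
  }
}

template <typename T>
void DtorN(void* ptr, std::size_t n) {
  T* typed = static_cast<T*>(ptr);
  for (std::size_t i = 0; i < n; ++i) {
    typed[i].~T();
  }
}

int main() {
  alignas(std::string) unsigned char a[2 * sizeof(std::string)];
  alignas(std::string) unsigned char b[2 * sizeof(std::string)];
  CtorN<std::string>(a, 2);
  CtorN<std::string>(b, 2);
  reinterpret_cast<std::string*>(a)[0] = "hello";
  CopyN<std::string>(a, b, 2);
  std::printf("%s\n", reinterpret_cast<std::string*>(b)[0].c_str());  // hello
  DtorN<std::string>(a, 2);
  DtorN<std::string>(b, 2);
}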
CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative."); + } + + virtual ~IDEEPSqueezeOp() {} + + bool RunOnDevice() override { + const auto& X = Input(INPUT); + auto* Y = Output(OUTPUT); + + CAFFE_ENFORCE_GT( + X.ndims(), + dims_.back(), + "Input needs at least ", + (dims_.back() + 1), + " dimensions."); + const auto& ideep_dims = X.get_dims(); + vector dims(ideep_dims.begin(), ideep_dims.end()); + const auto& new_dims = SqueezeOp::ComputeDims(dims, dims_); + itensor::dims new_dims_ideep(new_dims.begin(), new_dims.end()); + if (&X != Y) { + // Copy if not inplace + ideep::direct_copy::compute(X, *Y); + } + Y->reshape(new_dims_ideep); + + return true; + } + + private: + vector dims_; + + INPUT_TAGS(INPUT); + OUTPUT_TAGS(OUTPUT); +}; + +REGISTER_IDEEP_OPERATOR(Squeeze, IDEEPSqueezeOp); + +} // namespace caffe2 diff --git a/caffe2/mkl/operators/operator_fallback_mkl.cc b/caffe2/mkl/operators/operator_fallback_mkl.cc index a7e9082b7c5994..cb865c5dcbe85b 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.cc +++ b/caffe2/mkl/operators/operator_fallback_mkl.cc @@ -5,7 +5,7 @@ #include "caffe2/operators/cross_entropy_op.h" #include "caffe2/operators/dropout_op.h" #include "caffe2/operators/elementwise_linear_op.h" -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_ops.h" #include "caffe2/operators/filler_op.h" #include "caffe2/operators/load_save_op.h" #include "caffe2/operators/loss_op.h" @@ -16,14 +16,17 @@ namespace caffe2 { namespace { + struct SigmoidCPUFunctor { template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { + bool operator()(const int n, const T* x, T* y, CPUContext* /* context */) + const { ConstEigenVectorArrayMap xM(x, n); EigenVectorArrayMap(y, n) = 1. / (1. 
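// Standalone illustrative sketch of the shape bookkeeping behind the new
// IDEEP Squeeze operator above: the listed axes (assumed sorted, deduplicated
// and non-negative, as the constructor enforces) are dropped from the shape.
// SqueezeDims is a hypothetical helper; the real operator delegates to
// SqueezeOp::ComputeDims and then reshapes the itensor in place.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> SqueezeDims(const std::vector<int64_t>& dims,
                                 const std::vector<int>& axes) {
  std::vector<int64_t> out;
  std::size_t a = 0;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    if (a < axes.size() && static_cast<std::size_t>(axes[a]) == i) {
      ++a;  // drop this axis; callers typically also enforce dims[i] == 1
      continue;
    }
    out.push_back(dims[i]);
  }
  return out;
}

int main() {
  const auto out = SqueezeDims({1, 3, 1, 5}, {0, 2});
  for (const auto d : out) {
    std::printf("%lld ", static_cast<long long>(d));  // 3 5
  }
  std::printf("\n");
}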
+ (-xM).exp()); + return true; } }; + } // namespace } // namespace caffe2 diff --git a/caffe2/operators/abs_op.cc b/caffe2/operators/abs_op.cc index 75379beb4a6c12..40e4568309a61b 100644 --- a/caffe2/operators/abs_op.cc +++ b/caffe2/operators/abs_op.cc @@ -1,36 +1,37 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/abs_op.h" -namespace caffe2 { +#include +#include -struct AbsCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Abs(n, x, y, device_context); - } -}; +namespace caffe2 { -struct AbsGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = - (xM == T(0)).select(T(0), (xM > T(0)).select(dyM, -dyM)); - } -}; +template <> +template +bool AbsGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = + (X_arr == T(0)).select(T(0), (X_arr > T(0)).select(dY_arr, -dY_arr)); + return true; +} REGISTER_CPU_OPERATOR( Abs, - UnaryElementwiseOp, CPUContext, AbsCPUFunctor>); + UnaryElementwiseOp, CPUContext, AbsFunctor>); REGISTER_CPU_OPERATOR( AbsGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + AbsGradientFunctor>); OPERATOR_SCHEMA(Abs) .NumInputs(1) @@ -48,15 +49,21 @@ Calculates the absolute value of the given input tensor, element-wise. OPERATOR_SCHEMA(AbsGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +namespace { + class GetAbsGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "AbsGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Abs, GetAbsGradient); + } // namespace caffe2 diff --git a/caffe2/operators/abs_op.cu b/caffe2/operators/abs_op.cu index c4ccdd7ffde229..8da37af5777988 100644 --- a/caffe2/operators/abs_op.cu +++ b/caffe2/operators/abs_op.cu @@ -1,61 +1,60 @@ -#include +#include "caffe2/operators/abs_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void AbsKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = fabs(X[i]); - } -} +namespace { template -__global__ void AbsGradientKernel(const int N, const T* X, const T* dY, T* dX) { +__global__ void +AbsGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(X + i) == T(0) + ? T(0) + : (__ldg(X + i) > T(0) ? __ldg(dY + i) : -__ldg(dY + i)); +#else dX[i] = X[i] == T(0) ? T(0) : (X[i] > T(0) ? 
dY[i] : -dY[i]); +#endif } } -struct AbsCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - AbsKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct AbsGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - AbsGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool AbsGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + AbsGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Abs, - UnaryElementwiseOp, CUDAContext, AbsCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + AbsFunctor>); REGISTER_CUDA_OPERATOR( AbsGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + AbsGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/abs_op.h b/caffe2/operators/abs_op.h new file mode 100644 index 00000000000000..9b59557e040a62 --- /dev/null +++ b/caffe2/operators/abs_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_ABS_OP_H_ +#define CAFFE2_OPERATORS_ABS_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AbsFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Abs(N, X, Y, context); + return true; + } +}; + +template +struct AbsGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ABS_OP_H_ diff --git a/caffe2/operators/acos_op.cc b/caffe2/operators/acos_op.cc index 8db8e7cf2343de..67e8477869cfa7 100644 --- a/caffe2/operators/acos_op.cc +++ b/caffe2/operators/acos_op.cc @@ -1,35 +1,39 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/acos_op.h" -namespace caffe2 { +#include +#include -struct AcosCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Acos(n, x, y, device_context); - } -}; +namespace caffe2 { -struct AcosGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = -dyM / sqrt(1 - xM * xM); - } -}; +template <> +template +bool AcosGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = -dY_arr * (T(1) - X_arr.square()).rsqrt(); + return true; +} REGISTER_CPU_OPERATOR( Acos, - UnaryElementwiseOp, CPUContext, AcosCPUFunctor>); + UnaryElementwiseOp< + TensorTypes, + CPUContext, + AcosFunctor>); REGISTER_CPU_OPERATOR( AcosGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + AcosGradientFunctor>); OPERATOR_SCHEMA(Acos) .NumInputs(1) @@ 
-44,18 +48,26 @@ Calculates the arccosine of the given input tensor, element-wise. "output", "The arccosine of the input tensor computed element-wise"); -OPERATOR_SCHEMA(AcosGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +OPERATOR_SCHEMA(AcosGradient) + .NumInputs(2) + .NumOutputs(1) + .IdenticalTypeAndShape(); + +namespace { class GetAcosGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "AcosGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Acos, GetAcosGradient); + } // namespace caffe2 diff --git a/caffe2/operators/acos_op.cu b/caffe2/operators/acos_op.cu index e15085249da83d..e1fcc23104e596 100644 --- a/caffe2/operators/acos_op.cu +++ b/caffe2/operators/acos_op.cu @@ -1,61 +1,60 @@ -#include +#include "caffe2/operators/acos_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void AcosKernel(const int N, const T* X, T* Y) { +namespace { + +__global__ void AcosGradientCUDAKernel( + const int N, + const float* dY, + const float* X, + float* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = acos(X[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = -__ldg(dY + i) * rsqrtf(1.0f - __ldg(X + i) * __ldg(X + i)); +#else + dX[i] = -dY[i] * rsqrtf(1.0f - X[i] * X[i]); +#endif } } +} // namespace + +template <> template -__global__ void AcosGradientKernel(const int N, const T* X, const T* dY, T* dX) { - CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = -dY[i] / sqrt(1 - X[i] * X[i]); - } +bool AcosGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + AcosGradientCUDAKernel<<< + CAFFE_GET_BLOCKS(size), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>(size, dY, X, dX); + return true; } -struct AcosCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - AcosKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct AcosGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - AcosGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; - REGISTER_CUDA_OPERATOR( Acos, - UnaryElementwiseOp, CUDAContext, AcosCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + AcosFunctor>); REGISTER_CUDA_OPERATOR( AcosGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + AcosGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/acos_op.h b/caffe2/operators/acos_op.h new file mode 100644 index 00000000000000..6ca53ce8d784f8 --- /dev/null +++ b/caffe2/operators/acos_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_ACOS_OP_H_ +#define CAFFE2_OPERATORS_ACOS_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AcosFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Acos(N, X, Y, context); + return true; + } +}; + +template +struct AcosGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& 
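// Standalone illustrative sketch of the refactor applied to these element-wise
// operators: a small forward functor plus a gradient functor exposing
// Forward(dY_dims, X_dims, dY, X, dX, context), which the real code plugs into
// UnaryElementwiseOp / BinaryElementwiseOp. Here the context is a placeholder
// struct, plain loops replace Eigen, and Acos (dX = -dY / sqrt(1 - X^2)) is
// the worked example; names suffixed with "Sketch" are hypothetical.
#include <cmath>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

struct CPUContextStub {};

struct AcosFunctorSketch {
  template <typename T>
  bool operator()(const int n, const T* x, T* y, CPUContextStub* /* ctx */) const {
    for (int i = 0; i < n; ++i) {
      y[i] = std::acos(x[i]);
    }
    return true;
  }
};

struct AcosGradientFunctorSketch {
  template <typename T>
  bool Forward(const std::vector<int>& dY_dims,
               const std::vector<int>& /* X_dims */,
               const T* dY,
               const T* X,
               T* dX,
               CPUContextStub* /* ctx */) const {
    const int size = std::accumulate(
        dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies<int>());
    for (int i = 0; i < size; ++i) {
      dX[i] = -dY[i] / std::sqrt(T(1) - X[i] * X[i]);  // d/dx acos(x) = -1/sqrt(1-x^2)
    }
    return true;
  }
};

int main() {
  const std::vector<int> dims = {2, 2};
  const float X[4] = {0.0f, 0.1f, 0.2f, 0.3f};
  const float dY[4] = {1.0f, 1.0f, 1.0f, 1.0f};
  float Y[4], dX[4];
  CPUContextStub ctx;
  AcosFunctorSketch()(4, X, Y, &ctx);
  AcosGradientFunctorSketch().Forward(dims, dims, dY, X, dX, &ctx);
  std::printf("acos(0.3) = %.4f, dX[3] = %.4f\n", Y[3], dX[3]);  // 1.2661, -1.0483
}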
X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ACOS_OP_H_ diff --git a/caffe2/operators/asin_op.cc b/caffe2/operators/asin_op.cc index 8b8b7509603901..163a6403b4d11b 100644 --- a/caffe2/operators/asin_op.cc +++ b/caffe2/operators/asin_op.cc @@ -1,35 +1,39 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/asin_op.h" -namespace caffe2 { +#include +#include -struct AsinCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Asin(n, x, y, device_context); - } -}; +namespace caffe2 { -struct AsinGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = dyM / sqrt(1 - xM * xM); - } -}; +template <> +template +bool AsinGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = dY_arr * (T(1) - X_arr.square()).rsqrt(); + return true; +} REGISTER_CPU_OPERATOR( Asin, - UnaryElementwiseOp, CPUContext, AsinCPUFunctor>); + UnaryElementwiseOp< + TensorTypes, + CPUContext, + AsinFunctor>); REGISTER_CPU_OPERATOR( AsinGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + AsinGradientFunctor>); OPERATOR_SCHEMA(Asin) .NumInputs(1) @@ -44,18 +48,26 @@ Calculates the arcsine of the given input tensor, element-wise. 
"output", "The arcsine of the input tensor computed element-wise"); -OPERATOR_SCHEMA(AsinGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +OPERATOR_SCHEMA(AsinGradient) + .NumInputs(2) + .NumOutputs(1) + .IdenticalTypeAndShape(); + +namespace { class GetAsinGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "AsinGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Asin, GetAsinGradient); + } // namespace caffe2 diff --git a/caffe2/operators/asin_op.cu b/caffe2/operators/asin_op.cu index 61de4cf7a29a64..d285cac0d7e09d 100644 --- a/caffe2/operators/asin_op.cu +++ b/caffe2/operators/asin_op.cu @@ -1,61 +1,60 @@ -#include +#include "caffe2/operators/asin_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void AsinKernel(const int N, const T* X, T* Y) { +namespace { + +__global__ void AsinGradientCUDAKernel( + const int N, + const float* dY, + const float* X, + float* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = asin(X[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * rsqrtf(1.0f - __ldg(X + i) * __ldg(X + i)); +#else + dX[i] = dY[i] * rsqrtf(1.0f - X[i] * X[i]); +#endif } } +} // namespace + +template <> template -__global__ void AsinGradientKernel(const int N, const T* X, const T* dY, T* dX) { - CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = dY[i] / sqrt(1 - X[i] * X[i]); - } +bool AsinGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + AsinGradientCUDAKernel<<< + CAFFE_GET_BLOCKS(size), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>(size, dY, X, dX); + return true; } -struct AsinCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - AsinKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct AsinGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - AsinGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; - REGISTER_CUDA_OPERATOR( Asin, - UnaryElementwiseOp, CUDAContext, AsinCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + AsinFunctor>); REGISTER_CUDA_OPERATOR( AsinGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + AsinGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/asin_op.h b/caffe2/operators/asin_op.h new file mode 100644 index 00000000000000..0e03f24527855a --- /dev/null +++ b/caffe2/operators/asin_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_ASIN_OP_H_ +#define CAFFE2_OPERATORS_ASIN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AsinFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Asin(N, X, Y, context); + return true; + } +}; + +template +struct AsinGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // 
namespace caffe2 + +#endif // CAFFE2_OPERATORS_ASIN_OP_H_ diff --git a/caffe2/operators/atan_op.cc b/caffe2/operators/atan_op.cc index f93d62ea18353f..47e6cf3be52be6 100644 --- a/caffe2/operators/atan_op.cc +++ b/caffe2/operators/atan_op.cc @@ -1,35 +1,39 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/atan_op.h" -namespace caffe2 { +#include +#include -struct AtanCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Atan(n, x, y, device_context); - } -}; +namespace caffe2 { -struct AtanGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = dyM / (1 + xM * xM); - } -}; +template <> +template +bool AtanGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = dY_arr / (T(1) + X_arr.square()); + return true; +} REGISTER_CPU_OPERATOR( Atan, - UnaryElementwiseOp, CPUContext, AtanCPUFunctor>); + UnaryElementwiseOp< + TensorTypes, + CPUContext, + AtanFunctor>); REGISTER_CPU_OPERATOR( AtanGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + AtanGradientFunctor>); OPERATOR_SCHEMA(Atan) .NumInputs(1) @@ -44,18 +48,26 @@ Calculates the arctangent of the given input tensor, element-wise. "output", "The arctangent of the input tensor computed element-wise"); -OPERATOR_SCHEMA(AtanGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +OPERATOR_SCHEMA(AtanGradient) + .NumInputs(2) + .NumOutputs(1) + .IdenticalTypeAndShape(); + +namespace { class GetAtanGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "AtanGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Atan, GetAtanGradient); + } // namespace caffe2 diff --git a/caffe2/operators/atan_op.cu b/caffe2/operators/atan_op.cu index e1cc0c00b5681d..c71015d79b5814 100644 --- a/caffe2/operators/atan_op.cu +++ b/caffe2/operators/atan_op.cu @@ -1,61 +1,58 @@ -#include +#include "caffe2/operators/atan_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { +namespace { + template -__global__ void AtanKernel(const int N, const T* X, T* Y) { +__global__ void +AtanGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = atan(X[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) / (T(1) + __ldg(X + i) * __ldg(X + i)); +#else + dX[i] = dY[i] / (T(1) + X[i] * X[i]); +#endif } } +} // namespace + +template <> template -__global__ void AtanGradientKernel(const int N, const T* X, const T* dY, T* dX) { - CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = dY[i] / (1 + X[i] * X[i]); - } +bool AtanGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = 
std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + AtanGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; } -struct AtanCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - AtanKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct AtanGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - AtanGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; - REGISTER_CUDA_OPERATOR( Atan, - UnaryElementwiseOp, CUDAContext, AtanCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + AtanFunctor>); REGISTER_CUDA_OPERATOR( AtanGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + AtanGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/atan_op.h b/caffe2/operators/atan_op.h new file mode 100644 index 00000000000000..6bde917fd5db83 --- /dev/null +++ b/caffe2/operators/atan_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_ATAN_OP_H_ +#define CAFFE2_OPERATORS_ATAN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AtanFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Atan(N, X, Y, context); + return true; + } +}; + +template +struct AtanGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ATAN_OP_H_ diff --git a/caffe2/operators/cos_op.cc b/caffe2/operators/cos_op.cc index 8328a3c47046be..8f65fb81dd7690 100644 --- a/caffe2/operators/cos_op.cc +++ b/caffe2/operators/cos_op.cc @@ -1,35 +1,36 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/cos_op.h" -namespace caffe2 { +#include +#include -struct CosCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Cos(n, x, y, device_context); - } -}; +namespace caffe2 { -struct CosGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = -dyM * sin(xM); - } -}; +template <> +template +bool CosGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = -dY_arr * X_arr.sin(); + return true; +} REGISTER_CPU_OPERATOR( Cos, - UnaryElementwiseOp, CPUContext, CosCPUFunctor>); + UnaryElementwiseOp, CPUContext, CosFunctor>); REGISTER_CPU_OPERATOR( CosGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + CosGradientFunctor>); OPERATOR_SCHEMA(Cos) .NumInputs(1) @@ -46,15 +47,21 @@ Calculates the cosine of the given input tensor, element-wise. 
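// Quick numerical sanity check (illustrative only, not a Caffe2 test) of the
// element-wise derivatives implemented by the gradient kernels above:
//   d/dx asin(x) =  1 / sqrt(1 - x^2)
//   d/dx atan(x) =  1 / (1 + x^2)
//   d/dx cos(x)  = -sin(x)
// Each analytic value is compared against a central finite difference.
#include <cmath>
#include <cstdio>

int main() {
  const double x = 0.3;
  const double h = 1e-6;
  struct Check {
    const char* name;
    double analytic;
    double numeric;
  } checks[] = {
      {"asin", 1.0 / std::sqrt(1.0 - x * x),
       (std::asin(x + h) - std::asin(x - h)) / (2.0 * h)},
      {"atan", 1.0 / (1.0 + x * x),
       (std::atan(x + h) - std::atan(x - h)) / (2.0 * h)},
      {"cos", -std::sin(x),
       (std::cos(x + h) - std::cos(x - h)) / (2.0 * h)},
  };
  for (const Check& c : checks) {
    std::printf("%-4s analytic=%.6f numeric=%.6f\n", c.name, c.analytic, c.numeric);
  }
}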
OPERATOR_SCHEMA(CosGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +namespace { + class GetCosGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "CosGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Cos, GetCosGradient); + } // namespace caffe2 diff --git a/caffe2/operators/cos_op.cu b/caffe2/operators/cos_op.cu index 76aa1e443ac923..fcabc6781c78d0 100644 --- a/caffe2/operators/cos_op.cu +++ b/caffe2/operators/cos_op.cu @@ -1,61 +1,58 @@ -#include +#include "caffe2/operators/cos_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void CosKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = cos(X[i]); - } -} +namespace { template -__global__ void CosGradientKernel(const int N, const T* X, const T* dY, T* dX) { +__global__ void +CosGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = -__ldg(dY + i) * sin(__ldg(X + i)); +#else dX[i] = -dY[i] * sin(X[i]); +#endif } } -struct CosCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - CosKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct CosGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - CosGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool CosGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + CosGradientCUDAKernel<<< + CAFFE_GET_BLOCKS(size), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Cos, - UnaryElementwiseOp, CUDAContext, CosCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + CosFunctor>); REGISTER_CUDA_OPERATOR( CosGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + CosGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/cos_op.h b/caffe2/operators/cos_op.h new file mode 100644 index 00000000000000..5a25172fa8c77d --- /dev/null +++ b/caffe2/operators/cos_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_COS_OP_H_ +#define CAFFE2_OPERATORS_COS_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct CosFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Cos(N, X, Y, context); + return true; + } +}; + +template +struct CosGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_COS_OP_H_ diff --git a/caffe2/operators/elementwise_add_op.cc b/caffe2/operators/elementwise_add_op.cc index d914876e2ae0af..b02c956d1b78df 100644 --- a/caffe2/operators/elementwise_add_op.cc +++ b/caffe2/operators/elementwise_add_op.cc @@ -1,10 
+1,33 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_add_op.h" namespace caffe2 { -// See the operations supported here: -// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html -#define EIGEN_ADD(x, y) ((x) + (y)) -EIGEN_FUNCTOR(Add, EIGEN_ADD, NumericTypes, SameTypeAsInput); -#undef EIGEN_ADD -} +REGISTER_CPU_OPERATOR( + Add, + BinaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + AddGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CPUContext, + AddFunctor>); + +namespace { + +class GetAddGradient final : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "AddGradient", + "", + std::vector{GO(0), I(0), I(1)}, + std::vector{GI(0), GI(1)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(Add, GetAddGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_add_op.h b/caffe2/operators/elementwise_add_op.h new file mode 100644 index 00000000000000..5af98f7f483a39 --- /dev/null +++ b/caffe2/operators/elementwise_add_op.h @@ -0,0 +1,73 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_ADD_OP_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_ADD_OP_H_ + +#include +#include +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AddFunctor { + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A, + const TIn* B, + TOut* C, + Context* context) const { + math::Add( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A, + B, + C, + context); + return true; + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* /* A */, + const TIn* /* B */, + const TOut* /* C */, + TGrad* dA, + TGrad* dB, + Context* context) const { + const std::vector C_dims = + ComputeBinaryBroadcastForwardDims(A_dims, B_dims); + std::vector A_axes; + std::vector B_axes; + ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes); + math::ReduceSum( + C_dims.size(), + C_dims.data(), + A_axes.size(), + A_axes.data(), + dC, + dA, + context); + math::ReduceSum( + C_dims.size(), + C_dims.data(), + B_axes.size(), + B_axes.data(), + dC, + dB, + context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_ADD_OP_H_ diff --git a/caffe2/operators/elementwise_add_op_gpu.cc b/caffe2/operators/elementwise_add_op_gpu.cc new file mode 100644 index 00000000000000..2f38bbd36e1569 --- /dev/null +++ b/caffe2/operators/elementwise_add_op_gpu.cc @@ -0,0 +1,17 @@ +#include "caffe2/operators/elementwise_add_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Add, + BinaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + AddGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CUDAContext, + AddFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_div_op.cc b/caffe2/operators/elementwise_div_op.cc index 955ab848d566a6..226589bd997e4a 100644 --- a/caffe2/operators/elementwise_div_op.cc +++ b/caffe2/operators/elementwise_div_op.cc @@ -1,27 +1,120 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_div_op.h" + +#include +#include namespace caffe2 { -// See the operations supported here: -// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html -#define EIGEN_DIV(x, y) ((x) / (y)) -EIGEN_FUNCTOR(Div, EIGEN_DIV, NumericTypes, 
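// Standalone illustrative sketch of the broadcast bookkeeping behind
// AddFunctor::Backward above: C has the NumPy-style broadcast shape of A and
// B, and the gradient of each input is dC summed over the axes where that
// input was broadcast (size-1 or missing leading axes). The real code calls
// ComputeBinaryBroadcastBackwardAxes and math::ReduceSum; BroadcastDims below
// is a hypothetical stand-in that only reports the reduction axes for A.
#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<int> BroadcastDims(const std::vector<int>& A,
                               const std::vector<int>& B,
                               std::vector<int>* A_reduce_axes) {
  const int ndim = static_cast<int>(std::max(A.size(), B.size()));
  const int off_a = ndim - static_cast<int>(A.size());
  const int off_b = ndim - static_cast<int>(B.size());
  std::vector<int> C(ndim);
  A_reduce_axes->clear();
  for (int i = 0; i < ndim; ++i) {
    const int a = i < off_a ? 1 : A[i - off_a];
    const int b = i < off_b ? 1 : B[i - off_b];
    C[i] = std::max(a, b);  // assumes the shapes are broadcast-compatible
    if (a == 1 && C[i] != 1) {
      A_reduce_axes->push_back(i);  // dA must sum dC over this output axis
    }
  }
  return C;
}

int main() {
  std::vector<int> axes;
  const auto C = BroadcastDims({3, 1, 5}, {4, 5}, &axes);
  for (const int d : C) std::printf("%d ", d);      // 3 4 5
  std::printf("| reduce dC for dA over axes:");
  for (const int a : axes) std::printf(" %d", a);   // 1
  std::printf("\n");
}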
SameTypeAsInput); -#undef EIGEN_DIV - -void ElementWiseDivide( - CPUContext& /* unused context */, - const int n, - float* dXdata, - float* dYdata, - const float* dZdata, - const float* Ydata, - const float* Zdata) { - ConstEigenVectorArrayMap dZdataVec(dZdata, n); - ConstEigenVectorArrayMap YdataVec(Ydata, n); - ConstEigenVectorArrayMap ZdataVec(Zdata, n); - EigenVectorArrayMap(dXdata, n) = dZdataVec / YdataVec; - EigenVectorArrayMap(dYdata, n) = - (dZdataVec * ZdataVec) / YdataVec; +namespace { + +template +void ComputeDivGradient( + const int ndim, + const int* A_dims, + const int* B_dims, + const int* C_dims, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dA, + TGrad* dB, + CPUContext* context) { + const int A_size = + std::accumulate(A_dims, A_dims + ndim, 1, std::multiplies()); + const int B_size = + std::accumulate(B_dims, B_dims + ndim, 1, std::multiplies()); + const int C_size = + std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies()); + math::Set(A_size, TGrad(0), dA, context); + math::Set(B_size, TGrad(0), dB, context); + std::vector index(ndim, 0); + for (int C_index = 0; C_index < C_size; ++C_index) { + const int A_index = + math::utils::GetIndexFromDims(ndim, A_dims, index.data()); + const int B_index = + math::utils::GetIndexFromDims(ndim, B_dims, index.data()); + dA[A_index] += dC[C_index] / B[B_index]; + dB[B_index] += -dC[C_index] * C[C_index] / B[B_index]; + math::utils::IncreaseIndexInDims(ndim, C_dims, index.data()); + } } -REGISTER_CPU_OPERATOR(DivGradient, DivGradientOp); +} // namespace + +template <> +template +bool DivFunctor::Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* /* A */, + const TIn* B, + const TOut* C, + TGrad* dA, + TGrad* dB, + CPUContext* context) const { + if (A_dims == B_dims) { + const int size = std::accumulate( + A_dims.cbegin(), A_dims.cend(), 1, std::multiplies()); + math::Div(size, dC, B, dA, context); + EigenVectorMap(dB, size) = + -ConstEigenVectorArrayMap(dC, size) * + ConstEigenVectorArrayMap(C, size) / + ConstEigenVectorArrayMap(B, size); + return true; + } + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector A_broadcast_dims(ndim); + std::vector B_broadcast_dims(ndim); + std::vector C_broadcast_dims(ndim); + math::utils::ComputeBroadcastBinaryOpDims( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data()); + ComputeDivGradient( + ndim, + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data(), + dC, + B, + C, + dA, + dB, + context); + return true; } + +REGISTER_CPU_OPERATOR( + Div, + BinaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + DivGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CPUContext, + DivFunctor>); + +namespace { + +class GetDivGradient final : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "DivGradient", + "", + std::vector{GO(0), I(0), I(1), O(0)}, + std::vector{GI(0), GI(1)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(Div, GetDivGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu new file mode 100644 index 00000000000000..acba40b1d57ac9 --- /dev/null +++ b/caffe2/operators/elementwise_div_op.cu @@ -0,0 +1,316 @@ +#include "caffe2/operators/elementwise_div_op.h" + +#include +#include + +#include +#include + +#include 
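Both the ComputeDivGradient loop and the same-shape fast path implement dA = dC / B and dB = -dC * A / B^2, with the second written as -dC * C / B because C = A / B is already available and saves a division. A same-shape, CPU-only sketch of that math, assuming float tensors:

#include <cstddef>

// Sketch: same-shape DivGradient. dA = dC / B, dB = -dC * C / B.
void DivGradientReference(std::size_t n, const float* dC, const float* B,
                          const float* C, float* dA, float* dB) {
  for (std::size_t i = 0; i < n; ++i) {
    dA[i] = dC[i] / B[i];
    dB[i] = -dC[i] * C[i] / B[i];  // equivalent to -dC * A / (B * B)
  }
}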
"caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +using BlockReduce = cub::BlockReduce; + +template +__global__ void ComputeDivAGradientCUDAKernel( + const int outer_size, + const int inner_size, + const SimpleArray C_dims, + const SimpleArray C_strides, + const SimpleArray B_strides, + const SimpleArray A_dims, + const TGrad* dC, + const TIn* B, + TGrad* dA) { + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + TGrad sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int A_index = i * inner_size + j; + int C_index = 0; + int A_index_val = A_index; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + C_index += (A_index_val % A_dims.data[d]) * C_strides.data[d]; + A_index_val /= A_dims.data[d]; + } + int B_index = 0; + int C_index_val = C_index; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + B_index += B_strides.data[d] == 0 + ? 0 + : (C_index_val % C_dims.data[d]) * B_strides.data[d]; + C_index_val /= C_dims.data[d]; + } +#if __CUDA_ARCH__ >= 350 + sum += __ldg(dC + C_index) / __ldg(B + B_index); +#else + sum += dC[C_index] / B[B_index]; +#endif + } + sum = BlockReduce(temp_storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + dA[i] = sum; + } + __syncthreads(); + } +} + +template +__global__ void ComputeSimpleDivBGradientCUDAKernel( + const int size, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dB) { + CUDA_1D_KERNEL_LOOP(i, size) { +#if __CUDA_ARCH__ >= 350 + dB[i] = -__ldg(dC + i) * __ldg(C + i) / __ldg(B + i); +#else + dB[i] = -dC[i] * C[i] / B[i]; +#endif + } +} + +template +__global__ void ComputeDivBGradientCUDAKernel( + const int outer_size, + const int inner_size, + const SimpleArray C_strides, + const SimpleArray B_dims, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dB) { + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + TGrad sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + int C_index = 0; + int B_index = i * inner_size + j; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + C_index += (B_index % B_dims.data[d]) * C_strides.data[d]; + B_index /= B_dims.data[d]; + } +#if __CUDA_ARCH__ >= 350 + sum += -__ldg(dC + C_index) * __ldg(C + C_index) / __ldg(B + i); +#else + sum += -dC[C_index] * C[C_index] / B[i]; +#endif + } + sum = BlockReduce(temp_storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + dB[i] = sum; + } + __syncthreads(); + } +} + +template +void ComputeDivAGradientCUDAImpl( + const int outer_size, + const int inner_size, + const int* C_dims, + const int* B_dims, + const int* A_axes, + const TGrad* dC, + const TIn* B, + TGrad* dA, + CUDAContext* context) { + SimpleArray C_dims_arr; + SimpleArray C_strides_arr; + SimpleArray B_strides_arr; + SimpleArray A_dims_arr; + std::copy_n(C_dims, D, C_dims_arr.data); + math::utils::ComputeTransposedStrides(D, C_dims, A_axes, C_strides_arr.data); + int cur_stride = 1; + for (int i = D - 1; i >= 0; --i) { + B_strides_arr.data[i] = B_dims[i] == 1 ? 
0 : cur_stride; + cur_stride *= B_dims[i]; + } + for (int i = 0; i < D; ++i) { + A_dims_arr.data[i] = C_dims[A_axes[i]]; + } + ComputeDivAGradientCUDAKernel + <<cuda_stream()>>>( + outer_size, + inner_size, + C_dims_arr, + C_strides_arr, + B_strides_arr, + A_dims_arr, + dC, + B, + dA); +} + +template +void ComputeDivBGradientCUDAImpl( + const int outer_size, + const int inner_size, + const int* C_dims, + const int* B_axes, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dB, + CUDAContext* context) { + SimpleArray C_strides_arr; + SimpleArray B_dims_arr; + math::utils::ComputeTransposedStrides(D, C_dims, B_axes, C_strides_arr.data); + for (int i = 0; i < D; ++i) { + B_dims_arr.data[i] = C_dims[B_axes[i]]; + } + ComputeDivBGradientCUDAKernel + <<cuda_stream()>>>( + outer_size, inner_size, C_strides_arr, B_dims_arr, dC, B, C, dB); +} + +template +void ComputeDivAGradientCUDA( + const std::vector& C_dims, + const std::vector& B_dims, + const std::vector& A_axes, + const TGrad* dC, + const TIn* B, + TGrad* dA, + CUDAContext* context) { + CAFFE_ENFORCE_EQ(C_dims.size(), B_dims.size()); + const int ndim = C_dims.size(); + std::vector A_transpose_axes(ndim); + math::utils::ComputeTransposeAxesForReduceOp( + ndim, A_axes.size(), A_axes.data(), A_transpose_axes.data()); + const int pivot = ndim - A_axes.size(); + int outer_size = 1; + for (int i = 0; i < pivot; ++i) { + outer_size *= C_dims[A_transpose_axes[i]]; + } + int inner_size = 1; + for (int i = pivot; i < ndim; ++i) { + inner_size *= C_dims[A_transpose_axes[i]]; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( + ndim, + ComputeDivAGradientCUDAImpl, + TGrad, + TIn, + outer_size, + inner_size, + C_dims.data(), + B_dims.data(), + A_transpose_axes.data(), + dC, + B, + dA, + context); +} + +template +void ComputeDivBGradientCUDA( + const std::vector& C_dims, + const std::vector& B_axes, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dB, + CUDAContext* context) { + const int ndim = C_dims.size(); + std::vector B_transpose_axes(ndim); + math::utils::ComputeTransposeAxesForReduceOp( + ndim, B_axes.size(), B_axes.data(), B_transpose_axes.data()); + const int pivot = ndim - B_axes.size(); + int outer_size = 1; + for (int i = 0; i < pivot; ++i) { + outer_size *= C_dims[B_transpose_axes[i]]; + } + int inner_size = 1; + for (int i = pivot; i < ndim; ++i) { + inner_size *= C_dims[B_transpose_axes[i]]; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3( + ndim, + ComputeDivBGradientCUDAImpl, + TGrad, + TIn, + TOut, + outer_size, + inner_size, + C_dims.data(), + B_transpose_axes.data(), + dC, + B, + C, + dB, + context); +} + +} // namespace + +template <> +template +bool DivFunctor::Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* /* A */, + const TIn* B, + const TOut* C, + TGrad* dA, + TGrad* dB, + CUDAContext* context) const { + if (A_dims == B_dims) { + const int size = std::accumulate( + A_dims.cbegin(), A_dims.cend(), 1, std::multiplies()); + math::Div(size, dC, B, dA, context); + ComputeSimpleDivBGradientCUDAKernel + <<cuda_stream()>>>(size, dC, B, C, dB); + return true; + } + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector A_broadcast_dims(ndim); + std::vector B_broadcast_dims(ndim); + std::vector C_broadcast_dims(ndim); + math::utils::ComputeBroadcastBinaryOpDims( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data()); + std::vector A_axes; + std::vector B_axes; + 
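The broadcast-gradient kernels in this file share one reduction shape: each block owns a single reduced output element, its threads stride over the inner extent accumulating private sums, cub::BlockReduce combines them through shared memory, thread 0 writes the result, and __syncthreads() protects the shared temp storage before the next grid-stride iteration. A stripped-down sketch of that pattern (hypothetical kernel name; assumes CUB is on the include path and the launch uses blockDim.x == kBlockSize):

#include <cub/block/block_reduce.cuh>

// Sketch: one block per output element, block-wide sum over inner_size.
template <typename T, int kBlockSize>
__global__ void RowSumKernel(const int outer_size, const int inner_size,
                             const T* in, T* out) {
  using BlockReduce = cub::BlockReduce<T, kBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
    T sum = 0;
    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
      sum += in[i * inner_size + j];
    }
    sum = BlockReduce(temp_storage).Sum(sum);
    if (threadIdx.x == 0) {
      out[i] = sum;
    }
    __syncthreads();  // temp_storage is reused on the next outer iteration
  }
}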
ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes); + ComputeDivAGradientCUDA( + C_broadcast_dims, B_broadcast_dims, A_axes, dC, B, dA, context); + ComputeDivBGradientCUDA( + C_broadcast_dims, B_axes, dC, B, C, dB, context); + return true; +} + +REGISTER_CUDA_OPERATOR( + Div, + BinaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + DivGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CUDAContext, + DivFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_div_op.h b/caffe2/operators/elementwise_div_op.h new file mode 100644 index 00000000000000..d89fcd4f5381a0 --- /dev/null +++ b/caffe2/operators/elementwise_div_op.h @@ -0,0 +1,48 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_DIV_OP_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_DIV_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct DivFunctor { + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A, + const TIn* B, + TOut* C, + Context* context) const { + math::Div( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A, + B, + C, + context); + return true; + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC_data, + const TIn* A_data, + const TIn* B_data, + const TOut* C_data, + TGrad* dA_data, + TGrad* dB_data, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_DIV_OP_H_ diff --git a/caffe2/operators/elementwise_logical_ops.h b/caffe2/operators/elementwise_logical_ops.h index 90ec047e390a0d..99b84c58303975 100644 --- a/caffe2/operators/elementwise_logical_ops.h +++ b/caffe2/operators/elementwise_logical_ops.h @@ -5,7 +5,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_ops.h" #include diff --git a/caffe2/operators/elementwise_mul_op.cc b/caffe2/operators/elementwise_mul_op.cc index 75a5e076aef159..9878e613b4b607 100644 --- a/caffe2/operators/elementwise_mul_op.cc +++ b/caffe2/operators/elementwise_mul_op.cc @@ -1,10 +1,117 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_mul_op.h" + +#include +#include namespace caffe2 { -// See the operations supported here: -// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html -#define EIGEN_MUL(x, y) ((x) * (y)) -EIGEN_FUNCTOR(Mul, EIGEN_MUL, NumericTypes, SameTypeAsInput); -#undef EIGEN_MUL +namespace { + +template +void ComputeMulGradient( + const int ndim, + const int* A_dims, + const int* B_dims, + const int* C_dims, + const TGrad* dC, + const TIn* A, + const TIn* B, + TGrad* dA, + TGrad* dB, + CPUContext* context) { + const int A_size = + std::accumulate(A_dims, A_dims + ndim, 1, std::multiplies()); + const int B_size = + std::accumulate(B_dims, B_dims + ndim, 1, std::multiplies()); + const int C_size = + std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies()); + math::Set(A_size, TGrad(0), dA, context); + math::Set(B_size, TGrad(0), dB, context); + std::vector index(ndim, 0); + for (int C_index = 0; C_index < C_size; ++C_index) { + const int A_index = + math::utils::GetIndexFromDims(ndim, A_dims, index.data()); + const int B_index = + math::utils::GetIndexFromDims(ndim, B_dims, index.data()); + dA[A_index] += dC[C_index] * B[B_index]; + dB[B_index] += dC[C_index] * A[A_index]; + 
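MulFunctor's backward is the product rule in elementwise form: with C = A * B, dA = dC * B and dB = dC * A, each then reduced over its operand's broadcast axes exactly as in the Add and Div cases. A same-shape CPU sketch:

#include <cstddef>

// Sketch: same-shape MulGradient. dA = dC * B, dB = dC * A.
void MulGradientReference(std::size_t n, const float* dC, const float* A,
                          const float* B, float* dA, float* dB) {
  for (std::size_t i = 0; i < n; ++i) {
    dA[i] = dC[i] * B[i];
    dB[i] = dC[i] * A[i];
  }
}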
math::utils::IncreaseIndexInDims(ndim, C_dims, index.data()); + } +} + +} // namespace + +template <> +template +bool MulFunctor::Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* A, + const TIn* B, + const TOut* /* C */, + TGrad* dA, + TGrad* dB, + CPUContext* context) const { + if (A_dims == B_dims) { + const int size = std::accumulate( + A_dims.cbegin(), A_dims.cend(), 1, std::multiplies()); + math::Mul(size, dC, B, dA, context); + math::Mul(size, dC, A, dB, context); + return true; + } + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector A_broadcast_dims(ndim); + std::vector B_broadcast_dims(ndim); + std::vector C_broadcast_dims(ndim); + math::utils::ComputeBroadcastBinaryOpDims( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data()); + ComputeMulGradient( + ndim, + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data(), + dC, + A, + B, + dA, + dB, + context); + return true; } + +REGISTER_CPU_OPERATOR( + Mul, + BinaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + MulGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CPUContext, + MulFunctor>); + +namespace { + +class GetMulGradient final : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "MulGradient", + "", + std::vector{GO(0), I(0), I(1)}, + std::vector{GI(0), GI(1)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(Mul, GetMulGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu new file mode 100644 index 00000000000000..1d891590525ffd --- /dev/null +++ b/caffe2/operators/elementwise_mul_op.cu @@ -0,0 +1,197 @@ +#include "caffe2/operators/elementwise_mul_op.h" + +#include +#include + +#include +#include + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +using BlockReduce = cub::BlockReduce; + +template +__global__ void ComputeMulGradientCUDAKernel( + const int outer_size, + const int inner_size, + const SimpleArray Y_dims, + const SimpleArray Y_strides, + const SimpleArray W_strides, + const SimpleArray X_dims, + const TGrad* dY, + const TIn* W, + TGrad* dX) { + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + TGrad sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int X_index = i * inner_size + j; + int Y_index = 0; + int X_index_val = X_index; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + Y_index += (X_index_val % X_dims.data[d]) * Y_strides.data[d]; + X_index_val /= X_dims.data[d]; + } + int W_index = 0; + int Y_index_val = Y_index; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + W_index += W_strides.data[d] == 0 + ? 
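The CPU gradients walk a single multi-index over the output shape and project it onto each input: math::utils::GetIndexFromDims flattens the index against a dim vector while letting size-1 (broadcast) dimensions contribute nothing, and IncreaseIndexInDims is an odometer-style increment. The following is an illustrative re-implementation of those two helpers, not the library code itself:

#include <vector>

// Sketch: flatten a multi-index against dims, treating size-1 dims as broadcast.
int GetIndexFromDimsSketch(const std::vector<int>& dims, const std::vector<int>& index) {
  int flat = 0;
  for (std::size_t d = 0; d < dims.size(); ++d) {
    flat = flat * dims[d] + (dims[d] == 1 ? 0 : index[d]);
  }
  return flat;
}

// Sketch: odometer-style increment of a multi-index over dims.
void IncreaseIndexInDimsSketch(const std::vector<int>& dims, std::vector<int>* index) {
  for (int d = static_cast<int>(dims.size()) - 1; d >= 0; --d) {
    if (++(*index)[d] < dims[d]) {
      return;
    }
    (*index)[d] = 0;  // carry into the next (more significant) dimension
  }
}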
0 + : (Y_index_val % Y_dims.data[d]) * W_strides.data[d]; + Y_index_val /= Y_dims.data[d]; + } +#if __CUDA_ARCH__ >= 350 + sum += __ldg(dY + Y_index) * __ldg(W + W_index); +#else + sum += dY[Y_index] * W[W_index]; +#endif + } + sum = BlockReduce(temp_storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + dX[i] = sum; + } + __syncthreads(); + } +} + +template +void ComputeMulGradientCUDAImpl( + const int outer_size, + const int inner_size, + const int* Y_dims, + const int* W_dims, + const int* X_axes, + const TGrad* dY, + const TIn* W, + TGrad* dX, + CUDAContext* context) { + SimpleArray Y_dims_arr; + SimpleArray Y_strides_arr; + SimpleArray W_strides_arr; + SimpleArray X_dims_arr; + std::copy_n(Y_dims, D, Y_dims_arr.data); + math::utils::ComputeTransposedStrides(D, Y_dims, X_axes, Y_strides_arr.data); + int cur_stride = 1; + for (int i = D - 1; i >= 0; --i) { + W_strides_arr.data[i] = W_dims[i] == 1 ? 0 : cur_stride; + cur_stride *= W_dims[i]; + } + for (int i = 0; i < D; ++i) { + X_dims_arr.data[i] = Y_dims[X_axes[i]]; + } + ComputeMulGradientCUDAKernel + <<cuda_stream()>>>( + outer_size, + inner_size, + Y_dims_arr, + Y_strides_arr, + W_strides_arr, + X_dims_arr, + dY, + W, + dX); +} + +template +void ComputeMulGradientCUDA( + const std::vector& Y_dims, + const std::vector& W_dims, + const std::vector& X_axes, + const TGrad* dY, + const TIn* W, + TGrad* dX, + CUDAContext* context) { + CAFFE_ENFORCE_EQ(Y_dims.size(), W_dims.size()); + const int ndim = Y_dims.size(); + std::vector X_transpose_axes(ndim); + math::utils::ComputeTransposeAxesForReduceOp( + ndim, X_axes.size(), X_axes.data(), X_transpose_axes.data()); + const int pivot = ndim - X_axes.size(); + int outer_size = 1; + for (int i = 0; i < pivot; ++i) { + outer_size *= Y_dims[X_transpose_axes[i]]; + } + int inner_size = 1; + for (int i = pivot; i < ndim; ++i) { + inner_size *= Y_dims[X_transpose_axes[i]]; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( + ndim, + ComputeMulGradientCUDAImpl, + TGrad, + TIn, + outer_size, + inner_size, + Y_dims.data(), + W_dims.data(), + X_transpose_axes.data(), + dY, + W, + dX, + context); +} + +} // namespace + +template <> +template +bool MulFunctor::Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* A, + const TIn* B, + const TOut* /* C */, + TGrad* dA, + TGrad* dB, + CUDAContext* context) const { + if (A_dims == B_dims) { + const int size = std::accumulate( + A_dims.cbegin(), A_dims.cend(), 1, std::multiplies()); + math::Mul(size, dC, B, dA, context); + math::Mul(size, dC, A, dB, context); + return true; + } + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector A_broadcast_dims(ndim); + std::vector B_broadcast_dims(ndim); + std::vector C_broadcast_dims(ndim); + math::utils::ComputeBroadcastBinaryOpDims( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data()); + std::vector A_axes; + std::vector B_axes; + ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes); + ComputeMulGradientCUDA( + C_broadcast_dims, B_broadcast_dims, A_axes, dC, B, dA, context); + ComputeMulGradientCUDA( + C_broadcast_dims, A_broadcast_dims, B_axes, dC, A, dB, context); + return true; +} + +REGISTER_CUDA_OPERATOR( + Mul, + BinaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + MulGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CUDAContext, + MulFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_mul_op.h 
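The stride tables built for these kernels encode broadcasting directly: any dimension of size 1 gets stride 0 (W_strides above), so when a flat output index is peeled apart digit by digit, broadcast dimensions add nothing to the input offset. A host-side sketch of that index mapping, assuming row-major layout and equal-length dim/stride vectors:

#include <vector>

// Sketch: map an output index onto a (possibly broadcast) input whose
// size-1 dims were given stride 0.
int BroadcastOffset(int out_index, const std::vector<int>& out_dims,
                    const std::vector<int>& in_strides) {
  int offset = 0;
  for (int d = static_cast<int>(out_dims.size()) - 1; d >= 0; --d) {
    offset += (out_index % out_dims[d]) * in_strides[d];
    out_index /= out_dims[d];
  }
  return offset;
}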
b/caffe2/operators/elementwise_mul_op.h new file mode 100644 index 00000000000000..f1c42edc48b07a --- /dev/null +++ b/caffe2/operators/elementwise_mul_op.h @@ -0,0 +1,48 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_MUL_OP_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_MUL_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct MulFunctor { + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A, + const TIn* B, + TOut* C, + Context* context) const { + math::Mul( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A, + B, + C, + context); + return true; + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC_data, + const TIn* A_data, + const TIn* B_data, + const TOut* C_data, + TGrad* dA_data, + TGrad* dB_data, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_MUL_OP_H_ diff --git a/caffe2/operators/elementwise_op.cc b/caffe2/operators/elementwise_op.cc deleted file mode 100644 index 43499beaa23413..00000000000000 --- a/caffe2/operators/elementwise_op.cc +++ /dev/null @@ -1,162 +0,0 @@ -#include "caffe2/operators/elementwise_op.h" - -namespace caffe2 { - -// For some comparison and logical operators, eigen does not have vectorized -// math so we need to improvise. -#define NAIVE_FUNCTOR(name, op, input_type, output_type) \ - struct Naive##name##Functor { \ - template \ - inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \ - for (int i = 0; i < n; ++i) { \ - out[i] = op(a[i], b[b_is_scalar ? 0 : i]); \ - } \ - } \ - template \ - void RunWithBroadcast( \ - const T* a, \ - const T* b, \ - R* out, \ - size_t pre, \ - size_t n, \ - CPUContext*) { \ - for (int i = 0; i < pre; ++i) { \ - for (int j = 0; j < n; ++j) { \ - out[i * n + j] = op(a[i * n + j], b[j]); \ - } \ - } \ - } \ - template \ - void RunWithBroadcast2( \ - const T* a, \ - const T* b, \ - R* out, \ - size_t pre, \ - size_t n, \ - size_t post, \ - CPUContext*) { \ - for (int i = 0; i < pre; ++i) { \ - for (int j = 0; j < n; ++j) { \ - for (int k = 0; k < post; ++k) { \ - out[(i * n + j) * post + k] = op(a[(i * n + j) * post + k], b[j]); \ - } \ - } \ - } \ - } \ - }; \ - REGISTER_CPU_OPERATOR( \ - name, \ - BinaryElementwiseOp< \ - input_type, \ - CPUContext, \ - Naive##name##Functor, \ - output_type>) - -#define NAIVE_LT(x, y) ((x) < (y)) -NAIVE_FUNCTOR(LT, NAIVE_LT, NumericTypes, FixedType); -#undef NAIVE_LT -#define NAIVE_LE(x, y) ((x) <= (y)) -NAIVE_FUNCTOR(LE, NAIVE_LE, NumericTypes, FixedType); -#undef NAIVE_LE -#define NAIVE_GT(x, y) ((x) > (y)) -NAIVE_FUNCTOR(GT, NAIVE_GT, NumericTypes, FixedType); -#undef NAIVE_GT -#define NAIVE_GE(x, y) ((x) >= (y)) -NAIVE_FUNCTOR(GE, NAIVE_GE, NumericTypes, FixedType); -#undef NAIVE_GE -#define NAIVE_EQ(x, y) ((x) == (y)) -NAIVE_FUNCTOR(EQ, NAIVE_EQ, IntBoolTypes, FixedType); -#undef NAIVE_EQ -#define NAIVE_AND(x, y) ((x) & (y)) -NAIVE_FUNCTOR(And, NAIVE_AND, BoolTypes, FixedType); -#undef NAIVE_AND -#define NAIVE_OR(x, y) ((x) | (y)) -NAIVE_FUNCTOR(Or, NAIVE_OR, BoolTypes, FixedType); -#undef NAIVE_OR -#define NAIVE_XOR(x, y) ((x) ^ (y)) -NAIVE_FUNCTOR(Xor, NAIVE_XOR, BoolTypes, FixedType); -#undef NAIVE_XOR - -struct NotFunctor { - inline void operator()(const int n, const bool* x, bool* y, CPUContext*) { - for (int i = 0; i < n; ++i) { - y[i] = !x[i]; - } - } -}; -REGISTER_CPU_OPERATOR( - Not, - UnaryElementwiseOp); - -template 
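Every binary op in this refactor plugs into the same stateless-functor contract shown in these headers: Forward(A_dims, B_dims, A, B, C, context) and Backward(A_dims, B_dims, dC, A, B, C, dA, dB, context), with the BinaryElementwise op templates supplying dtype dispatch, output sizing and broadcast bookkeeping. A deliberately simplified, same-shape-only sketch of a functor with that layout (subtraction used as the example; the real functors are additionally templated on dtypes and Context):

#include <functional>
#include <numeric>
#include <vector>

// Sketch only: a same-shape subtraction functor following the new
// Forward/Backward layout (no broadcasting, no Context, float only).
struct SubFunctorSketch {
  bool Forward(const std::vector<int>& A_dims, const std::vector<int>& /* B_dims */,
               const float* A, const float* B, float* C) const {
    const int size = std::accumulate(
        A_dims.cbegin(), A_dims.cend(), 1, std::multiplies<int>());
    for (int i = 0; i < size; ++i) {
      C[i] = A[i] - B[i];
    }
    return true;
  }

  bool Backward(const std::vector<int>& A_dims, const std::vector<int>& /* B_dims */,
                const float* dC, const float* /* A */, const float* /* B */,
                const float* /* C */, float* dA, float* dB) const {
    const int size = std::accumulate(
        A_dims.cbegin(), A_dims.cend(), 1, std::multiplies<int>());
    for (int i = 0; i < size; ++i) {
      dA[i] = dC[i];   // dC/dA = 1
      dB[i] = -dC[i];  // dC/dB = -1
    }
    return true;
  }
};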
-void SRLHelper::sum2one(const T* x, T* y, size_t n) { - *y = ConstEigenArrayMap(x, n, 1).sum(); -} - -template -void SRLHelper::RunWithBroadcastFront( - const T* x, - T* y, - size_t pre, - size_t n, - CPUContext*) { - EigenArrayMap(y, n, 1) = ConstEigenArrayMap(x, n, pre).rowwise().sum(); -} - -template -void SRLHelper::RunWithBroadcastBack( - const T* x, - T* y, - size_t post, - size_t n, - CPUContext*) { - EigenArrayMap(y, 1, n) = ConstEigenArrayMap(x, post, n).colwise().sum(); -} - -template -void SRLHelper::RunWithBroadcast2( - const T* a, - T* y, - size_t pre, - size_t n, - size_t post, - CPUContext*) { - for (int i = 0; i < n; ++i) { - y[i] = 0; - for (int j = 0; j < pre; ++j) { - for (int k = 0; k < post; ++k) { - y[i] += a[(j * n + i) * post + k]; - } - } - } -} - -template <> -template -bool SumReduceLikeOp::DoRunWithType() { - const auto& A = Input(0); - const auto& B = Input(1); - auto* C = Output(0); - CAFFE_ENFORCE(&B != C, "In-place is not allowed."); - C->ResizeLike(B); - const T* Adata = A.template data(); - auto* Cdata = C->template mutable_data(); - if (B.size() == 1) { - auto count = A.size(); - SRLHelper::sum2one(Adata, Cdata, count); - } else { - size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(A, B, axis_); - if (post == 1) { - SRLHelper::RunWithBroadcastFront(Adata, Cdata, pre, n, &context_); - } else if (pre == 1) { - SRLHelper::RunWithBroadcastBack(Adata, Cdata, post, n, &context_); - } else { - SRLHelper::RunWithBroadcast2(Adata, Cdata, pre, n, post, &context_); - } - } - return true; -} -REGISTER_CPU_OPERATOR(SumReduceLike, SumReduceLikeOp); - -} // namespace caffe2 diff --git a/caffe2/operators/elementwise_op.cu b/caffe2/operators/elementwise_op.cu deleted file mode 100644 index 567537f1e7fac1..00000000000000 --- a/caffe2/operators/elementwise_op.cu +++ /dev/null @@ -1,427 +0,0 @@ -#define CUB_STDERR -#include -#include -#include -#include "caffe2/core/common_gpu.h" -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/conversions.h" - -namespace caffe2 { - -#define CUDA_FUNCTOR(name, op, input_type, output_type) \ -template \ -__global__ void name##Kernel(const T* a, const T* b, R* out, int n) { \ - CUDA_1D_KERNEL_LOOP(i, n) { \ - out[i] = op(a[i], b[b_is_scalar ? 
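SumReduceLike is the reduction half of the legacy broadcast gradients: it collapses A down to B's shape by summing over the leading ("pre") and trailing ("post") extents that B does not cover, with A laid out as [pre][n][post]. A naive reference for the general case (the specialised paths are just fast variants for post == 1 and pre == 1):

#include <cstddef>
#include <vector>

// Sketch: y[i] = sum over pre and post of a, with a laid out as [pre][n][post].
std::vector<float> SumReduceLikeReference(const std::vector<float>& a,
                                          std::size_t pre, std::size_t n,
                                          std::size_t post) {
  std::vector<float> y(n, 0.0f);
  for (std::size_t j = 0; j < pre; ++j) {
    for (std::size_t i = 0; i < n; ++i) {
      for (std::size_t k = 0; k < post; ++k) {
        y[i] += a[(j * n + i) * post + k];
      }
    }
  }
  return y;
}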
0 : i]); \ - } \ -} \ -template \ -__global__ void name##BroadcastKernel( \ - const T* a, const T* b, R* out, int pre, int n) { \ - CUDA_1D_KERNEL_LOOP(i, pre * n) { \ - out[i] = op(a[i], b[i % n]); \ - } \ -} \ -template \ -__global__ void name##Broadcast2Kernel( \ - const T* a, const T* b, R* out, int pre, int n, int post) { \ - CUDA_1D_KERNEL_LOOP(i, pre * n * post) { \ - out[i] = op(a[i], b[(i / post) % n]); \ - } \ -} \ - \ -struct Cuda##name##Functor { \ - template \ - inline void Run( \ - size_t n, const T* a, const T* b, R* out, CUDAContext* context) { \ - name##Kernel<<cuda_stream()>>>( \ - a, b, out, n); \ - } \ - template \ - void RunWithBroadcast( \ - const T* a, const T* b, R* out, size_t pre, size_t n, \ - CUDAContext* context) { \ - name##BroadcastKernel<<cuda_stream()>>>( \ - a, b, out, pre, n); \ - } \ - template \ - void RunWithBroadcast2( \ - const T* a, const T* b, R* out, size_t pre, size_t n, size_t post, \ - CUDAContext* context) { \ - name##Broadcast2Kernel<<cuda_stream()>>>( \ - a, b, out, pre, n, post); \ - } \ -}; \ -REGISTER_CUDA_OPERATOR( \ - name, BinaryElementwiseOp< \ - input_type, CUDAContext, Cuda##name##Functor, output_type>) - -#define CUDA_SUB(x, y) ((x) - (y)) -CUDA_FUNCTOR(Sub, CUDA_SUB, NumericTypes, SameTypeAsInput); -#undef CUDA_SUB -#define CUDA_MUL(x, y) ((x) * (y)) -CUDA_FUNCTOR(Mul, CUDA_MUL, NumericTypes, SameTypeAsInput); -#undef CUDA_MUL -#define CUDA_DIV(x, y) ((x) / (y)) -CUDA_FUNCTOR(Div, CUDA_DIV, NumericTypes, SameTypeAsInput); -#undef CUDA_DIV -#define CUDA_LT(x, y) ((x) < (y)) -CUDA_FUNCTOR(LT, CUDA_LT, NumericTypes, FixedType); -#undef CUDA_LT -#define CUDA_LE(x, y) ((x) <= (y)) -CUDA_FUNCTOR(LE, CUDA_LE, NumericTypes, FixedType); -#undef CUDA_LE -#define CUDA_GT(x, y) ((x) > (y)) -CUDA_FUNCTOR(GT, CUDA_GT, NumericTypes, FixedType); -#undef CUDA_GT -#define CUDA_GE(x, y) ((x) >= (y)) -CUDA_FUNCTOR(GE, CUDA_GE, NumericTypes, FixedType); -#undef CUDA_GE -#define CUDA_EQ(x, y) ((x) == (y)) -CUDA_FUNCTOR(EQ, CUDA_EQ, IntBoolTypes, FixedType); -#undef CUDA_EQ -#define CUDA_AND(x, y) ((x) & (y)) -CUDA_FUNCTOR(And, CUDA_AND, BoolTypes, FixedType); -#undef CUDA_AND -#define CUDA_OR(x, y) ((x) | (y)) -CUDA_FUNCTOR(Or, CUDA_OR, BoolTypes, FixedType); -#undef CUDA_OR -#define CUDA_XOR(x, y) ((x) ^ (y)) -CUDA_FUNCTOR(Xor, CUDA_XOR, BoolTypes, FixedType); -#undef CUDA_XOR - -__global__ void NotKernel(const int n, const bool* x, bool* y) { - CUDA_1D_KERNEL_LOOP(i, n) { - y[i] = !x[i]; - } -} -struct CudaNotFunctor { - inline void operator()( - const int n, const bool* x, bool* y, CUDAContext* context) { - NotKernel<<cuda_stream()>>>(n, x, y); - } -}; -REGISTER_CUDA_OPERATOR( - Not, - UnaryElementwiseOp); - -__global__ void DivKernel(const int n, float *dXdata, float *dYdata, - const float *dZdata, const float *Ydata, - const float *Zdata) { - CUDA_1D_KERNEL_LOOP(i, n) { - dXdata[i] = dZdata[i] / Ydata[i]; - dYdata[i] = - (dZdata[i] * Zdata[i]) / Ydata[i]; - } -} - -void ElementWiseDivide( - CUDAContext& context, - const int n, - float* dXdata, - float* dYdata, - const float* dZdata, - const float* Ydata, - const float* Zdata) { - DivKernel<<>>(n, dXdata, dYdata, dZdata, Ydata, Zdata); -} - -REGISTER_CUDA_OPERATOR(DivGradient, DivGradientOp); - -namespace { - -template -__global__ void -reduce_sum_like_post1(const T* g_idata, T* g_odata, int pre, int N) { - int n = blockIdx.x * blockDim.x + threadIdx.x; - if (n >= N) { - return; - } - - float sum = 0.0; - for (int i = 0; i < pre; ++i) { - sum += convert::To(g_idata[i * N + n]); - } - - g_odata[n] = 
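device_reduce follows the standard two-call CUB idiom: the first DeviceReduce::Sum call with a null workspace pointer only reports temp_storage_bytes, the caller allocates that much scratch (here by resizing a caffe2 Tensor), and the second call performs the reduction on the stream. A standalone sketch using raw cudaMalloc instead of a Tensor; error checking omitted:

#include <cub/device/device_reduce.cuh>
#include <cuda_runtime.h>

// Sketch: sum n floats on the given stream using the two-phase CUB pattern.
void DeviceSumSketch(const float* d_in, float* d_out, int n, cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // Phase 1: query workspace size (no work is done while d_temp is nullptr).
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaMalloc(&d_temp, temp_bytes);
  // Phase 2: run the reduction.
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaFree(d_temp);
}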
convert::To(sum); -} - -template -void device_reduce( - const T* d_in, - T* d_out, - int N, - Tensor* buffer, - CUDAContext* context) { - // Determine temporary device storage requirements - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Sum( - NULL, temp_storage_bytes, d_in, d_out, N, context->cuda_stream()); - - auto buffer_size = temp_storage_bytes / sizeof(T); - buffer_size += temp_storage_bytes % sizeof(T) != 0 ? 1 : 0; - buffer->Resize(buffer_size); - void* d_temp_storage = static_cast(buffer->template mutable_data()); - // Run sum-reduction - cub::DeviceReduce::Sum( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - N, - context->cuda_stream()); -} - -template <> -void device_reduce( - const float16* in, - float16* out, - int N, - Tensor* buffer, - CUDAContext* context) { - auto buffer_size = 1; - - if (buffer->size() != buffer_size) { - buffer->Resize(buffer_size); - - math::Set( - N, - convert::To(1.), - buffer->mutable_data(), - context); - } - - CUBLAS_ENFORCE(cublasDotEx( - context->cublas_handle(), - N, - in, - CUDA_R_16F, - 1, - buffer->data(), - CUDA_R_16F, - 0, - out, - CUDA_R_16F, - CUDA_R_32F)); -} - -template -__global__ void -reduce_sum_like(const T* g_idata, T* g_odata, int pre, int N, int post) { - int n = blockIdx.x; - float sum = 0.0; - int limit = pre * post; - for (int i = threadIdx.x; i < limit; i += blockDim.x) { - int curPre = i / post; - int curPost = i % post; - - sum += convert::To(g_idata[curPre * N * post + n * post + curPost]); - } - // uses a shared memory reduction within block - typedef cub::BlockReduce BlockReduceT; - // Shared memory - __shared__ typename BlockReduceT::TempStorage temp_storage; - float aggregate = BlockReduceT(temp_storage).Sum(sum); - if (threadIdx.x == 0) { - g_odata[n] = convert::To(aggregate); - } -} -} // namespace - -template <> -template -bool SumReduceLikeOp::DoRunWithType() { - const auto& A = Input(0); - const auto& B = Input(1); - auto* C = Output(0); - auto count = A.size(); - CAFFE_ENFORCE(&B != C, "In-place is not allowed."); - C->ResizeLike(B); - const T* Adata = A.template data(); - auto* Cdata = C->template mutable_data(); - if (B.size() == 1) { - device_reduce(Adata, Cdata, count, &sum_buffer_, &context_); - } else { - size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(A, B, axis_); - // because we check shape(B) \in shape(A) before, - // post and pre cannot be 1 at same time - if (post == 1) { - reduce_sum_like_post1 - <<>>(Adata, Cdata, pre, n); - } else { - if (post >= 128) { - reduce_sum_like - <<>>(Adata, Cdata, pre, n, post); - } else if (post >= 64) { - reduce_sum_like - <<>>(Adata, Cdata, pre, n, post); - } else if (post >= 32) { - reduce_sum_like - <<>>(Adata, Cdata, pre, n, post); - } else { - reduce_sum_like - <<>>(Adata, Cdata, pre, n, post); - } - } - } - return true; -} - -template <> -bool SumReduceLikeOp::RunOnDevice() { - return DispatchHelper>::call(this, Input(0)); -} - -REGISTER_CUDA_OPERATOR(SumReduceLike, SumReduceLikeOp); - -namespace { - -template -__global__ void binary_add_kernel(const int N, const T* a, const T* b, T* r) { - CUDA_1D_KERNEL_LOOP(idx, N) { - r[idx] = convert::To( - convert::To(a[idx]) + - convert::To(is_scaler ? b[0] : b[idx])); - } -} - -template -__global__ void binary_add_kernel_broadcast( - const T* a, - const T* b, - T* r, - const int pre, - const int post, - const int n) { - CUDA_1D_KERNEL_LOOP(idx, no_post ? pre * n : pre * post * n) { - r[idx] = convert::To( - convert::To(a[idx]) + - convert::To(no_post ? 
b[idx % n] : b[(idx / post) % n])); - } -} -} // namespace - -// Actual Add operator, because the above macros are read-only. -class CUDAAddOp final : public Operator { - public: - CUDAAddOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - OP_SINGLE_ARG(bool, "broadcast", enable_broadcast_, 0), - OP_SINGLE_ARG(int, "axis", axis_, -1), - OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), - OP_SINGLE_ARG(string, "order", order_, "NCHW") { - // Figure out the correct axis to use. - if (enable_broadcast_) { - if (axis_ != -1) { - // Get axis from an explicit axis argument. - CAFFE_ENFORCE_EQ( - axis_str_.size(), - 0, - "Args axis and axis_str cannot be used simultaneously."); - } else if (axis_str_.size()) { - // Get the axis index semantically. - CAFFE_ENFORCE_EQ( - axis_str_.size(), 1, "Unsupported axis string", axis_str_); - size_t semantic_axis_ = order_.find(axis_str_); - CAFFE_ENFORCE_NE( - semantic_axis_, - string::npos, - "Unrecognizable axis string ", - axis_str_, - " from order string ", - order_); - axis_ = semantic_axis_; - } - } else { - CAFFE_ENFORCE( - axis_ == -1 && axis_str_.size() == 0, - "Do not specify axis or axis_str if broadcast is not enabled."); - } - } - - ~CUDAAddOp() {} - - template - bool DoRunWithType() { - auto& X0 = Input(0); - auto& X1 = Input(1); - auto* output = Output(0); - - output->ResizeLike(X0); - - const T* X0data = X0.template data(); - const T* X1data = X1.template data(); - T* outputData = output->template mutable_data(); - - if (!enable_broadcast_) { - CAFFE_ENFORCE_EQ( - X0.dims(), - X1.dims(), - "Dimension mismatch - did you forget to set broadcast=1?"); - binary_add_kernel<<< - CAFFE_GET_BLOCKS(X0.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(X0.size(), X0data, X1data, outputData); - } else if (X1.size() == 1) { - binary_add_kernel<<< - CAFFE_GET_BLOCKS(X0.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(X0.size(), X0data, X1data, outputData); - } else { - size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(X0, X1, axis_); - if (post == 1) { - binary_add_kernel_broadcast<<< - CAFFE_GET_BLOCKS(pre * n), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(X0data, X1data, outputData, pre, post, n); - } else { - binary_add_kernel_broadcast<<< - CAFFE_GET_BLOCKS(pre * post * n), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(X0data, X1data, outputData, pre, post, n); - } - } - return true; - } - - bool RunOnDevice() override { - if (Input(0).IsType()) { - return DoRunWithType(); - } else if (Input(0).IsType()) { - return DoRunWithType(); - } else if (Input(0).IsType()) { - return DoRunWithType(); - } else if (Input(0).IsType()) { - return DoRunWithType(); - } else { - return false; - } - } - - private: - bool enable_broadcast_; - int axis_; - string axis_str_; - string order_; -}; - -REGISTER_CUDA_OPERATOR(Add, CUDAAddOp); - -} // namespace caffe2 diff --git a/caffe2/operators/elementwise_op.h b/caffe2/operators/elementwise_op.h deleted file mode 100644 index d078f91d8a1bf2..00000000000000 --- a/caffe2/operators/elementwise_op.h +++ /dev/null @@ -1,432 +0,0 @@ -#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ -#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ - -#include "caffe2/core/common_omp.h" -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/core/tensor.h" -#include "caffe2/utils/math.h" - -#include - -namespace caffe2 { - -using NumericTypes = TensorTypes; -using IntTypes = 
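The axis_str / order handling converts a layout letter into a numeric axis by position: with order "NCHW", axis_str "C" resolves to axis 1, and a letter not found in the order string is rejected. A trivial sketch of that lookup, names illustrative:

#include <cstddef>
#include <string>

// Sketch: map a single layout letter to its position in the order string,
// e.g. ResolveSemanticAxis("NCHW", "C") == 1; returns -1 if not found.
int ResolveSemanticAxis(const std::string& order, const std::string& axis_str) {
  const std::size_t pos = order.find(axis_str);
  return pos == std::string::npos ? -1 : static_cast<int>(pos);
}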
TensorTypes; -using BoolTypes = TensorTypes; -using IntBoolTypes = TensorTypes; // discrete types - -struct SameTypeAsInput { - template - using type = T; -}; - -template -struct FixedType { - template - using type = R; -}; - -template < - typename InputTypes, - class Context, - class Functor, - class TypeMap = SameTypeAsInput> -class UnaryElementwiseWithArgsOp : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), functor_(*this) {} - - bool RunOnDevice() override { - return DispatchHelper::call(this, Input(0)); - } - - template - bool DoRunWithType() { - auto& input = Input(0); - auto* output = Output(0); - output->ResizeLike(input); - using R = typename TypeMap::template type; - functor_( - input.size(), - input.template data(), - output->template mutable_data(), - &context_); - return true; - } - - private: - Functor functor_; -}; - -/** - * WithDefaultConstructor is a functor that can be used as the functor of an - * UnaryElementwiseWithArgsOp. It simply forwards the operator() call into - * another functor that doesn't accept arguments in its constructor. - */ -template -struct WithDefaultConstructor { - explicit WithDefaultConstructor(OperatorBase& /*op*/) {} - - template - void operator()(int n, const In* in, Out* out, Context* c) { - Functor()(n, in, out, c); - } -}; - -/** - * UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the - * difference that it takes a functor with default constructor, e.g. that does - * not need to take into consideration any arguments during operator creation. - */ -template < - typename InputTypes, - class Context, - class Functor, - class OutputType = SameTypeAsInput> -using UnaryElementwiseOp = UnaryElementwiseWithArgsOp< - InputTypes, - Context, - WithDefaultConstructor, - OutputType>; - -template -std::tuple calculate_broadcast_sizes( - const Tensor& A, - const Tensor& B, - int axis) { - CAFFE_ENFORCE_GE( - A.ndim(), - B.ndim(), - "If you are doing broadcasting, input1 should have " - "a smaller or equal number of dimensions."); - if (axis == -1) { - axis = A.ndim() - B.ndim(); - } - CAFFE_ENFORCE( - axis >= 0 && axis <= A.ndim() - B.ndim(), - "Broadcast axis should be in the range of" - "[0, A.ndim() - B.ndim()], but axis = ", - axis); - - int b_dim_start = 0; - while (b_dim_start < B.ndim() && B.dim(b_dim_start) == 1) { - ++b_dim_start; - } - int b_dim_end = B.ndim() - 1; - while (b_dim_end >= b_dim_start && B.dim(b_dim_end) == 1) { - --b_dim_end; - } - size_t pre = 1, n = 1, post = 1; - for (int i = 0; i < axis + b_dim_start; ++i) { - pre *= A.dim(i); - } - for (int i = b_dim_start; i <= b_dim_end; ++i) { - CAFFE_ENFORCE_EQ( - A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch."); - n *= B.dim(i); - } - for (int i = axis + b_dim_end + 1; i < A.ndim(); ++i) { - post *= A.dim(i); - } - return std::make_tuple(pre, n, post); -} - -/** - * Performs a binary operation (e.g. +, - or /) with optional broadcast support. - * - * Functor specifies actual operation to be performed. - * - * If AllowBroadcast=false tensors has to be of exactly the same shape. - * - * If AllowBroadcast=true it support limited broadcasting of the right-hand-side - * argument to match the shape of left-hand-side argument. Only suffix matching - * is supported for now (1-dim expansion is allowed on both ends). E.g. 
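calculate_broadcast_sizes (kept in the new header as ComputeLegacyBroadcastSizes) factors A's shape into (pre, n, post): n is the extent matched by B after trimming B's size-1 edges, pre is everything to its left and post everything to its right, so a flat index of A decomposes as (i_pre * n + i_n) * post + i_post. For example, A = {2, 3, 4, 5} with B = {3, 4} and axis = 1 gives pre = 2, n = 12, post = 5. A simplified sketch that assumes B's size-1 edges are already trimmed:

#include <cassert>
#include <cstddef>
#include <vector>

// Sketch: legacy (pre, n, post) factorisation for suffix-style broadcasting.
void LegacyBroadcastSizesSketch(const std::vector<int>& A_dims,
                                const std::vector<int>& B_dims, int axis,
                                std::size_t* pre, std::size_t* n, std::size_t* post) {
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) {
    *pre *= A_dims[i];
  }
  for (std::size_t i = 0; i < B_dims.size(); ++i) {
    assert(A_dims[axis + i] == B_dims[i]);  // matched suffix must agree
    *n *= B_dims[i];
  }
  for (std::size_t i = axis + B_dims.size(); i < A_dims.size(); ++i) {
    *post *= A_dims[i];
  }
}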
this - * will be accepted: - * A dims: 2 3 4 5 6 - * B dims: 1 4 1 - * ^ - * | - * axis = 1 - */ -template < - typename InputTypes, - class Context, - class Functor, - class TypeMap = SameTypeAsInput> -class BinaryElementwiseOp : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - - BinaryElementwiseOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - OP_SINGLE_ARG(bool, "broadcast", enable_broadcast_, 0), - OP_SINGLE_ARG(int, "axis", axis_, -1), - OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), - OP_SINGLE_ARG(string, "order", order_, "NCHW"), - functor_() { - // Figure out the correct axis to use. - if (enable_broadcast_) { - if (axis_ != -1) { - // Get axis from an explicit axis argument. - CAFFE_ENFORCE_EQ( - axis_str_.size(), - 0, - "Args axis and axis_str cannot be used simultaneously."); - } else if (axis_str_.size()) { - // Get the axis index semantically. - CAFFE_ENFORCE_EQ( - axis_str_.size(), 1, "Unsupported axis string", axis_str_); - size_t semantic_axis_ = order_.find(axis_str_); - CAFFE_ENFORCE_NE( - semantic_axis_, - string::npos, - "Unrecognizable axis string ", - axis_str_, - " from order string ", - order_); - axis_ = semantic_axis_; - } - } else { - CAFFE_ENFORCE( - axis_ == -1 && axis_str_.size() == 0, - "Do not specify axis or axis_str if broadcast is not enabled."); - } - } - - bool RunOnDevice() override { - return DispatchHelper::call(this, Input(0)); - } - - template - bool DoRunWithType() { - const auto& A = Input(0); - const auto& B = Input(1); - auto* C = Output(0); - CAFFE_ENFORCE( - &B != C || !enable_broadcast_, - "In-place is allowed only with the first tensor when broadcasting"); - C->ResizeLike(A); - const T* Adata = A.template data(); - const T* Bdata = B.template data(); - auto* Cdata = - C->template mutable_data>(); - if (!enable_broadcast_) { - CAFFE_ENFORCE_EQ( - A.dims(), - B.dims(), - "Dimension mismatch - did you forget to set broadcast=1?"); - functor_.template Run(A.size(), Adata, Bdata, Cdata, &context_); - } else if (B.size() == 1) { - functor_.template Run(A.size(), Adata, Bdata, Cdata, &context_); - } else { - size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(A, B, axis_); - if (post == 1) { - functor_.RunWithBroadcast(Adata, Bdata, Cdata, pre, n, &context_); - } else { - functor_.RunWithBroadcast2( - Adata, Bdata, Cdata, pre, n, post, &context_); - } - } - return true; - } - - private: - bool enable_broadcast_; - int axis_; - string axis_str_; - string order_; - Functor functor_; -}; - -template -struct WithoutBroadcast { - template - inline void Run(size_t n, const T* a, const T* b, R* out, Context* c) { - if (b_is_scalar) { - CAFFE_THROW("Broadcast not supported."); - } else { - Functor().Run(n, a, b, out, c); - } - } - template - inline void RunWithBroadcast( - const T* /*a*/, - const T* /*b*/, - R* /*out*/, - size_t /*pre*/, - size_t /*n*/, - Context*) { - CAFFE_NOT_IMPLEMENTED; - } - template - inline void RunWithBroadcast2( - const T* /*a*/, - const T* /*b*/, - R* /*out*/, - size_t /*pre*/, - size_t /*n*/, - size_t /*post*/, - Context*) { - CAFFE_NOT_IMPLEMENTED; - } -}; - -// Gradient operator for elementwise division. 
-template -class DivGradientOp final : public Operator { - public: - USE_SIMPLE_CTOR_DTOR(DivGradientOp); - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override; -}; - -namespace SRLHelper { - -template -void sum2one(const T* a, T* y, size_t n); - -template -void RunWithBroadcastFront(const T* a, T* y, size_t pre, size_t n, CPUContext*); - -template -void RunWithBroadcastBack(const T* a, T* y, size_t post, size_t n, CPUContext*); - -template -void RunWithBroadcast2( - const T* a, - T* y, - size_t pre, - size_t n, - size_t post, - CPUContext*); - -} // namespace SRLHelper - -// Sum reduction operator that is used for computing the gradient in cases -// where the forward op is in broadcast mode. -template -class SumReduceLikeOp final : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - SumReduceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - OP_SINGLE_ARG(int, "axis", axis_, -1), - OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), - OP_SINGLE_ARG(string, "order", order_, "NCHW") { - if (axis_ != -1) { - // Get axis from an explicit axis argument. - CAFFE_ENFORCE_EQ( - axis_str_.size(), - 0, - "Args axis and axis_str cannot be used simultaneously."); - } else if (axis_str_.size()) { - // Get the axis index semantically. - CAFFE_ENFORCE_EQ( - axis_str_.size(), 1, "Unsupported axis string", axis_str_); - size_t semantic_axis = order_.find(axis_str_); - CAFFE_ENFORCE_NE( - semantic_axis, - string::npos, - "Unrecognizable axis string ", - axis_str_, - " from order string ", - order_); - axis_ = semantic_axis; - } - } - - bool RunOnDevice() override { - return DispatchHelper>::call(this, Input(0)); - } - - template - bool DoRunWithType(); - - private: - int axis_; - string axis_str_; - string order_; - Tensor ones_; - Tensor sum_buffer_; -}; - -template -bool DivGradientOp::RunOnDevice() { - auto& Y = Input(0); - auto& Z = Input(1); - auto& dZ = Input(2); - auto* dX = Output(0); - auto* dY = Output(1); - - dX->ResizeLike(Y); - dY->ResizeLike(Y); - - const float* Ydata = Y.template data(); - const float* Zdata = Z.template data(); - const float* dZdata = dZ.template data(); - float* dXdata = dX->template mutable_data(); - float* dYdata = dY->template mutable_data(); - - ElementWiseDivide(context_, Y.size(), dXdata, dYdata, dZdata, Ydata, Zdata); - return true; -} - -// For arithmetic operators, Eigen provides a good way to vectorize even -// when broadcasting. 
-#define EIGEN_FUNCTOR(name, eigen_op, input_type, output_type) \ - struct Eigen##name##Functor { \ - template \ - inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \ - if (b_is_scalar) { \ - EigenVectorArrayMap(out, n) = \ - eigen_op((ConstEigenVectorArrayMap(a, n)), (b[0])); \ - } else { \ - EigenVectorArrayMap(out, n) = eigen_op( \ - (ConstEigenVectorArrayMap(a, n)), \ - (ConstEigenVectorArrayMap(b, n))); \ - } \ - } \ - template \ - void RunWithBroadcast( \ - const T* a, \ - const T* b, \ - R* out, \ - size_t pre, \ - size_t n, \ - CPUContext*) { \ - EigenArrayMap(out, n, pre) = eigen_op( \ - (ConstEigenArrayMap(a, n, pre).colwise()), \ - (ConstEigenVectorArrayMap(b, n))); \ - } \ - template \ - void RunWithBroadcast2( \ - const T* a, \ - const T* b, \ - R* out, \ - size_t pre, \ - size_t n, \ - size_t post, \ - CPUContext*) { \ - for (int i = 0; i < pre; ++i) { \ - EigenArrayMap(out + i * n * post, post, n) = eigen_op( \ - (ConstEigenArrayMap(a + i * n * post, post, n).rowwise()), \ - (Eigen::Map>(b, n))); \ - } \ - } \ - }; \ - REGISTER_CPU_OPERATOR( \ - name, \ - BinaryElementwiseOp< \ - input_type, \ - CPUContext, \ - Eigen##name##Functor, \ - output_type>) - -} // namespace caffe2 - -#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index a284b1cd196de1..9b8f08136de5bd 100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -1,11 +1,12 @@ #ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_TEST_H_ #define CAFFE2_OPERATORS_ELEMENTWISE_OP_TEST_H_ +#include "caffe2/operators/elementwise_ops.h" + #include #include #include -#include "caffe2/operators/elementwise_op.h" #include template diff --git a/caffe2/operators/elementwise_ops.cc b/caffe2/operators/elementwise_ops.cc new file mode 100644 index 00000000000000..3a3dda53de2c0d --- /dev/null +++ b/caffe2/operators/elementwise_ops.cc @@ -0,0 +1,183 @@ +#include "caffe2/operators/elementwise_ops.h" + +#include + +namespace caffe2 { + +std::vector ComputeBinaryBroadcastForwardDims( + const std::vector& A_dims, + const std::vector& B_dims) { + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector C_dims(ndim); + int i = A_dims.size() - 1; + int j = B_dims.size() - 1; + int k = ndim - 1; + for (; i >= 0 && j >= 0; --k) { + CAFFE_ENFORCE(A_dims[i] == B_dims[j] || A_dims[i] == 1 || B_dims[j] == 1); + C_dims[k] = std::max(A_dims[i--], B_dims[j--]); + } + for (; i >= 0; --i) { + C_dims[k--] = A_dims[i]; + } + for (; j >= 0; --j) { + C_dims[k--] = B_dims[j]; + } + return C_dims; +} + +void ComputeBinaryBroadcastBackwardAxes( + const std::vector& A_dims, + const std::vector& B_dims, + std::vector* A_axes, + std::vector* B_axes) { + A_axes->clear(); + B_axes->clear(); + const int ndim = std::max(A_dims.size(), B_dims.size()); + int i = A_dims.size() - 1; + int j = B_dims.size() - 1; + int k = ndim - 1; + for (; i >= 0 && j >= 0; --k) { + CAFFE_ENFORCE(A_dims[i] == B_dims[j] || A_dims[i] == 1 || B_dims[j] == 1); + if (A_dims[i] != B_dims[j]) { + if (A_dims[i] == 1) { + A_axes->push_back(k); + } + if (B_dims[j] == 1) { + B_axes->push_back(k); + } + } + --i; + --j; + } + if (i < 0) { + for (; k >= 0; --k) { + A_axes->push_back(k); + } + } else { + for (; k >= 0; --k) { + B_axes->push_back(k); + } + } + std::reverse(A_axes->begin(), A_axes->end()); + std::reverse(B_axes->begin(), B_axes->end()); +} + +REGISTER_CPU_OPERATOR( + Not, + UnaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + Sign, + 
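ComputeBinaryBroadcastForwardDims aligns the two shapes from the right (each pair of dims must be equal or contain a 1, and the output takes the larger), while ComputeBinaryBroadcastBackwardAxes records, for each operand, the output axes it was broadcast over and must therefore be sum-reduced over in the backward pass. As an illustration (not taken from the tests): A = {2, 3, 4} and B = {3, 1} give C = {2, 3, 4}, with A_axes empty and B_axes = {0, 2}, so dB is dC summed over axes 0 and 2. A usage sketch against the declarations in this file:

#include <vector>
#include "caffe2/operators/elementwise_ops.h"  // declares the two helpers used below

void BroadcastAxesExample() {
  std::vector<int> A_dims{2, 3, 4}, B_dims{3, 1}, A_axes, B_axes;
  const std::vector<int> C_dims =
      caffe2::ComputeBinaryBroadcastForwardDims(A_dims, B_dims);  // {2, 3, 4}
  caffe2::ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes);
  // A_axes == {}     -> dA is dC, unreduced
  // B_axes == {0, 2} -> dB is dC summed over axes 0 and 2
  (void)C_dims;
}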
UnaryElementwiseOp>); + +#define REGISTER_CPU_COMPARE_OPERATOR(Op) \ + REGISTER_CPU_OPERATOR( \ + Op, \ + BinaryElementwiseOp< \ + TensorTypes, \ + CPUContext, \ + Op##Functor, \ + FixedType>) + +REGISTER_CPU_COMPARE_OPERATOR(EQ); +REGISTER_CPU_COMPARE_OPERATOR(NE); +REGISTER_CPU_COMPARE_OPERATOR(LT); +REGISTER_CPU_COMPARE_OPERATOR(LE); +REGISTER_CPU_COMPARE_OPERATOR(GT); +REGISTER_CPU_COMPARE_OPERATOR(GE); + +#undef REGISTER_CPU_COMPARE_OPERATOR + +#define REGISTER_CPU_LOGICAL_BINARY_OPERATOR(Op) \ + REGISTER_CPU_OPERATOR( \ + Op, BinaryElementwiseOp>) + +REGISTER_CPU_LOGICAL_BINARY_OPERATOR(And); +REGISTER_CPU_LOGICAL_BINARY_OPERATOR(Or); +REGISTER_CPU_LOGICAL_BINARY_OPERATOR(Xor); + +#undef REGISTER_CPU_LOGICAL_BINARY_OPERATOR + +#define REGISTER_CPU_BITWISE_BINARY_OPERATOR(Op) \ + REGISTER_CPU_OPERATOR( \ + Op, \ + BinaryElementwiseOp>) + +REGISTER_CPU_BITWISE_BINARY_OPERATOR(BitwiseAnd); +REGISTER_CPU_BITWISE_BINARY_OPERATOR(BitwiseOr); +REGISTER_CPU_BITWISE_BINARY_OPERATOR(BitwiseXor); + +#undef REGISTER_CPU_BITWISE_BINARY_OPERATOR + +template +void SRLHelper::sum2one(const T* x, T* y, size_t n) { + *y = ConstEigenArrayMap(x, n, 1).sum(); +} + +template +void SRLHelper::RunWithBroadcastFront( + const T* x, + T* y, + size_t pre, + size_t n, + CPUContext*) { + EigenArrayMap(y, n, 1) = ConstEigenArrayMap(x, n, pre).rowwise().sum(); +} + +template +void SRLHelper::RunWithBroadcastBack( + const T* x, + T* y, + size_t post, + size_t n, + CPUContext*) { + EigenArrayMap(y, 1, n) = ConstEigenArrayMap(x, post, n).colwise().sum(); +} + +template +void SRLHelper::RunWithBroadcast2( + const T* a, + T* y, + size_t pre, + size_t n, + size_t post, + CPUContext*) { + for (int i = 0; i < n; ++i) { + y[i] = 0; + for (int j = 0; j < pre; ++j) { + for (int k = 0; k < post; ++k) { + y[i] += a[(j * n + i) * post + k]; + } + } + } +} + +template <> +template +bool SumReduceLikeOp::DoRunWithType() { + const auto& A = Input(0); + const auto& B = Input(1); + auto* C = Output(0); + CAFFE_ENFORCE(&B != C, "In-place is not allowed."); + C->ResizeLike(B); + const T* Adata = A.template data(); + auto* Cdata = C->template mutable_data(); + if (B.size() == 1) { + auto count = A.size(); + SRLHelper::sum2one(Adata, Cdata, count); + } else { + size_t pre, n, post; + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); + if (post == 1) { + SRLHelper::RunWithBroadcastFront(Adata, Cdata, pre, n, &context_); + } else if (pre == 1) { + SRLHelper::RunWithBroadcastBack(Adata, Cdata, post, n, &context_); + } else { + SRLHelper::RunWithBroadcast2(Adata, Cdata, pre, n, post, &context_); + } + } + return true; +} + +REGISTER_CPU_OPERATOR(SumReduceLike, SumReduceLikeOp); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu new file mode 100644 index 00000000000000..4b1e355a926631 --- /dev/null +++ b/caffe2/operators/elementwise_ops.cu @@ -0,0 +1,214 @@ +#include "caffe2/operators/elementwise_ops.h" + +#include +#include +#include + +#include "caffe2/core/common_gpu.h" +#include "caffe2/core/context_gpu.h" +#include "caffe2/utils/conversions.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Not, + UnaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + Sign, + UnaryElementwiseOp>); + +#define REGISTER_CUDA_COMPARE_OPERATOR(Op) \ + REGISTER_CUDA_OPERATOR( \ + Op, \ + BinaryElementwiseOp< \ + TensorTypes, \ + CUDAContext, \ + Op##Functor, \ + FixedType>) + +REGISTER_CUDA_COMPARE_OPERATOR(EQ); +REGISTER_CUDA_COMPARE_OPERATOR(NE); +REGISTER_CUDA_COMPARE_OPERATOR(LT); 
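SRLHelper::RunWithBroadcastFront folds the leading "pre" extent by viewing the [pre][n] data as an n-by-pre column-major Eigen array and summing row-wise; RunWithBroadcastBack is the mirror image for the trailing extent. A self-contained sketch of the same view, assuming Eigen is available:

#include <Eigen/Core>

// Sketch: fold the leading "pre" extent of data laid out as [pre][n].
// Viewed column-major as n rows by pre columns, each column is one replica.
void ReduceFrontSketch(const float* x, float* y, int pre, int n) {
  Eigen::Map<const Eigen::ArrayXXf> X(x, n, pre);
  Eigen::Map<Eigen::ArrayXf>(y, n) = X.rowwise().sum();
}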
+REGISTER_CUDA_COMPARE_OPERATOR(LE); +REGISTER_CUDA_COMPARE_OPERATOR(GT); +REGISTER_CUDA_COMPARE_OPERATOR(GE); + +#undef REGISTER_CUDA_COMPARE_OPERATOR + +#define REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Op) \ + REGISTER_CUDA_OPERATOR( \ + Op, \ + BinaryElementwiseOp>) + +REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(And); +REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Or); +REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Xor); + +#undef REGISTER_CUDA_LOGICAL_BINARY_OPERATOR + +#define REGISTER_CUDA_BITWISE_BINARY_OPERATOR(Op) \ + REGISTER_CUDA_OPERATOR( \ + Op, \ + BinaryElementwiseOp< \ + IntBoolTypes, \ + CUDAContext, \ + Op##Functor>) + +REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseAnd); +REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseOr); +REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseXor); + +#undef REGISTER_CUDA_BITWISE_BINARY_OPERATOR + +namespace { + +template +__global__ void +reduce_sum_like_post1(const T* g_idata, T* g_odata, int pre, int N) { + int n = blockIdx.x * blockDim.x + threadIdx.x; + if (n >= N) { + return; + } + + float sum = 0.0; + for (int i = 0; i < pre; ++i) { + sum += convert::To(g_idata[i * N + n]); + } + + g_odata[n] = convert::To(sum); +} + +template +void device_reduce( + const T* d_in, + T* d_out, + int N, + Tensor* buffer, + CUDAContext* context) { + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Sum( + NULL, temp_storage_bytes, d_in, d_out, N, context->cuda_stream()); + + auto buffer_size = temp_storage_bytes / sizeof(T); + buffer_size += temp_storage_bytes % sizeof(T) != 0 ? 1 : 0; + buffer->Resize(buffer_size); + void* d_temp_storage = static_cast(buffer->template mutable_data()); + // Run sum-reduction + cub::DeviceReduce::Sum( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + N, + context->cuda_stream()); +} + +template <> +void device_reduce( + const float16* in, + float16* out, + int N, + Tensor* buffer, + CUDAContext* context) { + auto buffer_size = 1; + + if (buffer->size() != buffer_size) { + buffer->Resize(buffer_size); + + math::Set( + N, + convert::To(1.), + buffer->mutable_data(), + context); + } + + CUBLAS_ENFORCE(cublasDotEx( + context->cublas_handle(), + N, + in, + CUDA_R_16F, + 1, + buffer->data(), + CUDA_R_16F, + 0, + out, + CUDA_R_16F, + CUDA_R_32F)); +} + +template +__global__ void +reduce_sum_like(const T* g_idata, T* g_odata, int pre, int N, int post) { + int n = blockIdx.x; + float sum = 0.0; + int limit = pre * post; + for (int i = threadIdx.x; i < limit; i += blockDim.x) { + int curPre = i / post; + int curPost = i % post; + + sum += + convert::To(g_idata[curPre * N * post + n * post + curPost]); + } + // uses a shared memory reduction within block + typedef cub::BlockReduce BlockReduceT; + // Shared memory + __shared__ typename BlockReduceT::TempStorage temp_storage; + float aggregate = BlockReduceT(temp_storage).Sum(sum); + if (threadIdx.x == 0) { + g_odata[n] = convert::To(aggregate); + } +} +} // namespace + +template <> +template +bool SumReduceLikeOp::DoRunWithType() { + const auto& A = Input(0); + const auto& B = Input(1); + auto* C = Output(0); + auto count = A.size(); + CAFFE_ENFORCE(&B != C, "In-place is not allowed."); + C->ResizeLike(B); + const T* Adata = A.template data(); + auto* Cdata = C->template mutable_data(); + if (B.size() == 1) { + device_reduce(Adata, Cdata, count, &sum_buffer_, &context_); + } else { + size_t pre, n, post; + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); + // because we check shape(B) \in shape(A) before, + // post and pre cannot be 1 
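The float16 specialisation of device_reduce below cannot go through CUB, so it computes the sum as a dot product with "ones": cublasDotEx with CUDA_R_16F operands and CUDA_R_32F execution type accumulates in fp32, and because the y increment is 0 the "ones vector" is really a single cached 1.0 read N times, which is why the scratch buffer only needs one element. A hedged standalone sketch of that call (error handling and pointer-mode setup omitted; names are illustrative):

#include <cublas_v2.h>
#include <cuda_fp16.h>

// Sketch: sum of n fp16 values as dot(x, ones), accumulating in fp32.
// d_one points at a single 1.0h on the device; incy = 0 reuses it n times.
void HalfSumSketch(cublasHandle_t handle, int n, const __half* d_x,
                   const __half* d_one, __half* d_out) {
  cublasDotEx(handle, n,
              d_x, CUDA_R_16F, 1,
              d_one, CUDA_R_16F, 0,
              d_out, CUDA_R_16F, CUDA_R_32F);
}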
at same time + if (post == 1) { + reduce_sum_like_post1 + <<>>(Adata, Cdata, pre, n); + } else { + if (post >= 128) { + reduce_sum_like + <<>>(Adata, Cdata, pre, n, post); + } else if (post >= 64) { + reduce_sum_like + <<>>(Adata, Cdata, pre, n, post); + } else if (post >= 32) { + reduce_sum_like + <<>>(Adata, Cdata, pre, n, post); + } else { + reduce_sum_like + <<>>(Adata, Cdata, pre, n, post); + } + } + } + return true; +} + +template <> +bool SumReduceLikeOp::RunOnDevice() { + return DispatchHelper>::call(this, Input(0)); +} + +REGISTER_CUDA_OPERATOR(SumReduceLike, SumReduceLikeOp); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_ops.h b/caffe2/operators/elementwise_ops.h new file mode 100644 index 00000000000000..5ae4c3d8748494 --- /dev/null +++ b/caffe2/operators/elementwise_ops.h @@ -0,0 +1,568 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OPS_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_OPS_H_ + +#include +#include +#include +#include + +#include "caffe2/core/common_omp.h" +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +using NumericTypes = TensorTypes; +using IntTypes = TensorTypes; +using BoolTypes = TensorTypes; +using IntBoolTypes = TensorTypes; // discrete types + +struct SameTypeAsInput { + template + using type = T; +}; + +template +struct FixedType { + template + using type = R; +}; + +template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput> +class UnaryElementwiseWithArgsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), functor_(*this) {} + + bool RunOnDevice() override { + return DispatchHelper::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + Y->ResizeLike(X); + return functor_( + X.size(), + X.template data(), + Y->template mutable_data>(), + &context_); + } + + private: + Functor functor_; +}; + +// UnaryFunctorWithDefaultCtor is a functor that can be used as the functor of +// an UnaryElementwiseWithArgsOp. It simply forwards the operator() call into +// another functor that doesn't accept arguments in its constructor. +template +struct UnaryFunctorWithDefaultCtor { + explicit UnaryFunctorWithDefaultCtor(OperatorBase& /* op */) {} + + template + bool operator()(const int size, const TIn* X, TOut* Y, Context* context) + const { + return functor(size, X, Y, context); + } + + Functor functor{}; +}; + +// UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the +// difference that it takes a functor with default constructor, e.g. that does +// not need to take into consideration any arguments during operator creation. 
+template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput> +using UnaryElementwiseOp = UnaryElementwiseWithArgsOp< + InputTypes, + Context, + UnaryFunctorWithDefaultCtor, + OutputTypeMap>; + +template +std::tuple ComputeLegacyBroadcastSizes( + const Tensor& A, + const Tensor& B, + int axis) { + CAFFE_ENFORCE_GE( + A.ndim(), + B.ndim(), + "If you are doing broadcasting, input1 should have " + "a smaller or equal number of dimensions."); + if (axis == -1) { + axis = A.ndim() - B.ndim(); + } + CAFFE_ENFORCE( + axis >= 0 && axis <= A.ndim() - B.ndim(), + "Broadcast axis should be in the range of" + "[0, A.ndim() - B.ndim()], but axis = ", + axis); + + int b_dim_start = 0; + while (b_dim_start < B.ndim() && B.dim(b_dim_start) == 1) { + ++b_dim_start; + } + int b_dim_end = B.ndim() - 1; + while (b_dim_end >= b_dim_start && B.dim(b_dim_end) == 1) { + --b_dim_end; + } + size_t pre = 1, n = 1, post = 1; + for (int i = 0; i < axis + b_dim_start; ++i) { + pre *= A.dim(i); + } + for (int i = b_dim_start; i <= b_dim_end; ++i) { + CAFFE_ENFORCE_EQ( + A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch."); + n *= B.dim(i); + } + for (int i = axis + b_dim_end + 1; i < A.ndim(); ++i) { + post *= A.dim(i); + } + return std::make_tuple(pre, n, post); +} + +std::vector ComputeBinaryBroadcastForwardDims( + const std::vector& A_dims, + const std::vector& B_dims); + +void ComputeBinaryBroadcastBackwardAxes( + const std::vector& A_dims, + const std::vector& B_dims, + std::vector* A_axes, + std::vector* B_axes); + +template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput> +class BinaryElementwiseWithArgsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + BinaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + OP_SINGLE_ARG(bool, "broadcast", legacy_broadcast_, false), + OP_SINGLE_ARG(int, "axis", axis_, -1), + OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), + OP_SINGLE_ARG(string, "order", order_, "NCHW"), + functor_(*this) { + if (legacy_broadcast_) { + if (axis_ != -1) { + // Get axis from an explicit axis argument. + CAFFE_ENFORCE_EQ( + axis_str_.size(), + 0, + "Args axis and axis_str cannot be used simultaneously."); + } else if (axis_str_.size()) { + // Get the axis index semantically. 
+ CAFFE_ENFORCE_EQ( + axis_str_.size(), 1, "Unsupported axis string", axis_str_); + const size_t semantic_axis_ = order_.find(axis_str_); + CAFFE_ENFORCE_NE( + semantic_axis_, + string::npos, + "Unrecognizable axis string ", + axis_str_, + " from order string ", + order_); + axis_ = semantic_axis_; + } else { + CAFFE_ENFORCE( + axis_ == -1 && axis_str_.empty(), + "Do not specify axis or axis_str if broadcast is not enabled."); + } + } + } + + bool RunOnDevice() override { + return DispatchHelper::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& A = Input(0); + const auto& B = Input(1); + auto* C = Output(0); + const T* A_data = A.template data(); + const T* B_data = B.template data(); + std::vector A_dims; + std::vector B_dims; + + if (legacy_broadcast_) { + CAFFE_ENFORCE_NE( + C, + &B, + "In-place is allowed only with the first tensor when " + "legacy-broadcasting"); + C->ResizeLike(A); + if (B.size() == 1) { + A_dims = {static_cast(A.size())}; + B_dims = {1}; + } else { + size_t pre, n, post; + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); + A_dims = { + static_cast(pre), static_cast(n), static_cast(post)}; + B_dims = {static_cast(n), 1}; + } + } else { + std::copy(A.dims().cbegin(), A.dims().cend(), std::back_inserter(A_dims)); + std::copy(B.dims().cbegin(), B.dims().cend(), std::back_inserter(B_dims)); + const std::vector C_dims = + ComputeBinaryBroadcastForwardDims(A_dims, B_dims); + if (C == &A) { + CAFFE_ENFORCE_EQ(C_dims, A_dims); + } else if (C == &B) { + CAFFE_ENFORCE_EQ(C_dims, B_dims); + } else { + C->Resize(C_dims); + } + } + auto* C_data = + C->template mutable_data>(); + return functor_.Forward(A_dims, B_dims, A_data, B_data, C_data, &context_); + } + + private: + const bool legacy_broadcast_; + int axis_; + const std::string axis_str_; + const std::string order_; + + Functor functor_; +}; + +template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput, + class GradientTypeMap = SameTypeAsInput> +class BinaryElementwiseWithArgsGradientOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + BinaryElementwiseWithArgsGradientOp( + const OperatorDef& operator_def, + Workspace* ws) + : Operator(operator_def, ws), + OP_SINGLE_ARG(bool, "broadcast", legacy_broadcast_, false), + OP_SINGLE_ARG(int, "axis", axis_, -1), + OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), + OP_SINGLE_ARG(string, "order", order_, "NCHW"), + functor_(*this) { + if (legacy_broadcast_) { + if (axis_ != -1) { + // Get axis from an explicit axis argument. + CAFFE_ENFORCE_EQ( + axis_str_.size(), + 0, + "Args axis and axis_str cannot be used simultaneously."); + } else if (axis_str_.size()) { + // Get the axis index semantically. 
+ CAFFE_ENFORCE_EQ( + axis_str_.size(), 1, "Unsupported axis string", axis_str_); + const size_t semantic_axis_ = order_.find(axis_str_); + CAFFE_ENFORCE_NE( + semantic_axis_, + string::npos, + "Unrecognizable axis string ", + axis_str_, + " from order string ", + order_); + axis_ = semantic_axis_; + } else { + CAFFE_ENFORCE( + axis_ == -1 && axis_str_.empty(), + "Do not specify axis or axis_str if broadcast is not enabled."); + } + } + } + + bool RunOnDevice() override { + return DispatchHelper::call(this, Input(1)); + } + + template + bool DoRunWithType() { + const auto& dC = Input(0); + const auto& A = Input(1); + const auto& B = Input(2); + auto* dA = Output(0); + auto* dB = Output(1); + vector A_dims; + vector B_dims; + if (legacy_broadcast_) { + if (B.size() == 1) { + A_dims = {static_cast(A.size())}; + B_dims = {1}; + } else { + size_t pre, n, post; + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); + A_dims = { + static_cast(pre), static_cast(n), static_cast(post)}; + B_dims = {static_cast(n), 1}; + } + } else { + std::copy(A.dims().cbegin(), A.dims().cend(), std::back_inserter(A_dims)); + std::copy(B.dims().cbegin(), B.dims().cend(), std::back_inserter(B_dims)); + } + const typename OutputTypeMap::template type* C_data = nullptr; + if (InputSize() == 4) { + const auto& C = Input(3); + C_data = C.template data>(); + } + const auto* dC_data = + dC.template data>(); + const T* A_data = A.template data(); + const T* B_data = B.template data(); + dA->ResizeLike(A); + dB->ResizeLike(B); + auto* dA_data = + dA->template mutable_data>(); + auto* dB_data = + dB->template mutable_data>(); + return functor_.Backward( + A_dims, + B_dims, + dC_data, + A_data, + B_data, + C_data, + dA_data, + dB_data, + &context_); + } + + private: + const bool legacy_broadcast_; + int axis_; + const std::string axis_str_; + const std::string order_; + + Functor functor_; +}; + +template +struct BinaryFunctorWithDefaultCtor { + explicit BinaryFunctorWithDefaultCtor(OperatorBase& /* op */) {} + + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A_data, + const TIn* B_data, + TOut* C_data, + Context* context) const { + return functor.Forward(A_dims, B_dims, A_data, B_data, C_data, context); + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC_data, + const TIn* A_data, + const TIn* B_data, + const TOut* C_data, + TGrad* dA_data, + TGrad* dB_data, + Context* context) const { + return functor.Backward( + A_dims, + B_dims, + dC_data, + A_data, + B_data, + C_data, + dA_data, + dB_data, + context); + } + + Functor functor{}; +}; + +// BinaryElementwiseOp is a wrapper around BinaryElementwiseWithArgsOp, with the +// difference that it takes a functor with default constructor, e.g. that does +// not need to take into consideration any arguments during operator creation. +template < + typename InputTypes, + class Context, + class Functor, + class TypeMap = SameTypeAsInput> +using BinaryElementwiseOp = BinaryElementwiseWithArgsOp< + InputTypes, + Context, + BinaryFunctorWithDefaultCtor, + TypeMap>; + +// BinaryElementwiseGradientOp is a wrapper around +// BinaryElementwiseGradientWithArgsOp, with the difference that it takes a +// functor with default constructor, e.g. that does not need to take into +// consideration any arguments during operator creation. 
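When an input was broadcast in the forward pass, its gradient has to be summed from dC back down to that input's shape; this is what SumReduceLike and ComputeBinaryBroadcastBackwardAxes are used for by the gradient wrappers above. The following is a minimal reference of that reduction in the legacy (pre, n, post) layout, written against the standard library only and independent of the CUDA and Eigen kernels in this patch.

#include <cstdio>
#include <vector>

// dC has the shape of A, decomposed as (pre, n, post); B was broadcast along
// its single axis of length n, so dB[j] sums dC over the pre and post axes.
void SumReduceLikeRef(const float* dC, float* dB, int pre, int n, int post) {
  for (int j = 0; j < n; ++j) {
    float sum = 0.0f;
    for (int i = 0; i < pre; ++i) {
      for (int k = 0; k < post; ++k) {
        sum += dC[(i * n + j) * post + k];
      }
    }
    dB[j] = sum;
  }
}

int main() {
  // A: (2, 3, 2) -> pre = 2, n = 3, post = 2; B: (3,).
  std::vector<float> dC(12, 1.0f);
  std::vector<float> dB(3, 0.0f);
  SumReduceLikeRef(dC.data(), dB.data(), 2, 3, 2);
  std::printf("%g %g %g\n", dB[0], dB[1], dB[2]);  // each entry sums 4 ones
  return 0;
}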
+template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput, + class GradientTypeMap = SameTypeAsInput> +using BinaryElementwiseGradientOp = BinaryElementwiseWithArgsGradientOp< + InputTypes, + Context, + BinaryFunctorWithDefaultCtor, + OutputTypeMap, + GradientTypeMap>; + +// Forward-only Unary Functors. +template +struct NotFunctor { + bool operator()(const int N, const bool* X, bool* Y, Context* context) const { + math::Not(N, X, Y, context); + return true; + } +}; + +template +struct SignFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Sign(N, X, Y, context); + return true; + } +}; + +// Forward-only Binary Functors. +#define CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(FunctorName) \ + template \ + struct FunctorName##Functor { \ + template \ + bool Forward( \ + const std::vector& A_dims, \ + const std::vector& B_dims, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + Context* context) const { \ + math::FunctorName( \ + A_dims.size(), \ + A_dims.data(), \ + B_dims.size(), \ + B_dims.data(), \ + A, \ + B, \ + C, \ + context); \ + return true; \ + } \ + }; + +// Compare functors. +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(EQ); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(NE); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(LT); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(LE); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(GT); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(GE); + +// Logical functors. +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(And); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(Or); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(Xor); + +// Bitwise functors. +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(BitwiseAnd); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(BitwiseOr); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(BitwiseXor); + +#undef CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR + +namespace SRLHelper { + +template +void sum2one(const T* a, T* y, size_t n); + +template +void RunWithBroadcastFront(const T* a, T* y, size_t pre, size_t n, CPUContext*); + +template +void RunWithBroadcastBack(const T* a, T* y, size_t post, size_t n, CPUContext*); + +template +void RunWithBroadcast2( + const T* a, + T* y, + size_t pre, + size_t n, + size_t post, + CPUContext*); + +} // namespace SRLHelper + +// Sum reduction operator that is used for computing the gradient in cases +// where the forward op is in broadcast mode. +template +class SumReduceLikeOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + SumReduceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + OP_SINGLE_ARG(int, "axis", axis_, -1), + OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), + OP_SINGLE_ARG(string, "order", order_, "NCHW") { + if (axis_ != -1) { + // Get axis from an explicit axis argument. + CAFFE_ENFORCE_EQ( + axis_str_.size(), + 0, + "Args axis and axis_str cannot be used simultaneously."); + } else if (axis_str_.size()) { + // Get the axis index semantically. 
+ CAFFE_ENFORCE_EQ( + axis_str_.size(), 1, "Unsupported axis string", axis_str_); + size_t semantic_axis = order_.find(axis_str_); + CAFFE_ENFORCE_NE( + semantic_axis, + string::npos, + "Unrecognizable axis string ", + axis_str_, + " from order string ", + order_); + axis_ = semantic_axis; + } + } + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType(); + + private: + int axis_; + string axis_str_; + string order_; + Tensor ones_; + Tensor sum_buffer_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_OPS_H_ diff --git a/caffe2/operators/elementwise_op_schema.cc b/caffe2/operators/elementwise_ops_schema.cc similarity index 62% rename from caffe2/operators/elementwise_op_schema.cc rename to caffe2/operators/elementwise_ops_schema.cc index 860f2f55ed80bf..cab6179364b2ca 100644 --- a/caffe2/operators/elementwise_op_schema.cc +++ b/caffe2/operators/elementwise_ops_schema.cc @@ -1,5 +1,6 @@ +#include "caffe2/operators/elementwise_ops.h" + #include "caffe2/core/operator_gradient.h" -#include "caffe2/operators/elementwise_op.h" #include "caffe2/utils/proto_utils.h" namespace caffe2 { @@ -56,6 +57,11 @@ OPERATOR_SCHEMA(Add) .IdenticalTypeAndShapeOfInput(0) .FillUsing(MathDocGenerator("addition")) .InheritOnnxSchema("Add"); +OPERATOR_SCHEMA(AddGradient) + .NumInputs(3) + .NumOutputs(2) + .AllowInplace({{0, 0}, {1, 0}}); + OPERATOR_SCHEMA(Sub) .NumInputs(2) .NumOutputs(1) @@ -64,6 +70,11 @@ OPERATOR_SCHEMA(Sub) .IdenticalTypeAndShapeOfInput(0) .FillUsing(MathDocGenerator("subtraction")) .InheritOnnxSchema("Sub"); +OPERATOR_SCHEMA(SubGradient) + .NumInputs(3) + .NumOutputs(2) + .AllowInplace({{0, 0}, {1, 0}}); + OPERATOR_SCHEMA(Mul) .NumInputs(2) .NumOutputs(1) @@ -72,6 +83,11 @@ OPERATOR_SCHEMA(Mul) .IdenticalTypeAndShapeOfInput(0) .FillUsing(MathDocGenerator("multiplication")) .InheritOnnxSchema("Mul"); +OPERATOR_SCHEMA(MulGradient) + .NumInputs(3) + .NumOutputs(2) + .AllowInplace({{0, 0}, {1, 0}}); + OPERATOR_SCHEMA(Div) .NumInputs(2) .NumOutputs(1) @@ -80,7 +96,7 @@ OPERATOR_SCHEMA(Div) .IdenticalTypeAndShapeOfInput(0) .FillUsing(MathDocGenerator("division")) .InheritOnnxSchema("Div"); -OPERATOR_SCHEMA(DivGradient).NumInputs(3).NumOutputs(2).AllowInplace({{0, 0}}); +OPERATOR_SCHEMA(DivGradient).NumInputs(4).NumOutputs(2).AllowInplace({{0, 0}}); OPERATOR_SCHEMA(SumReduceLike) .NumInputs(2) @@ -120,159 +136,6 @@ For example, the following tensor shapes are supported: "If broadcasting is disabled it should be of the same size.") .Output(0, "C", "Result, has same dimensions and type as B"); -class GetAddGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (!ArgumentHelper::HasArgument(Def(), "broadcast")) { - SetDense(0, GO(0)); - SetDense(1, GO(0)); - return vector(); - } - SetDense(0, GO(0)); - - return SingleGradientDef( - "SumReduceLike", - "", - vector{GO(0), I(1)}, - vector{GI(1)}); - } -}; -REGISTER_GRADIENT(Add, GetAddGradient); - -// TODO(jiayq): Although we have Sub gradient implemented, we are still missing -// the Negative unary operator to be implemented. 
-class GetSubGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (!ArgumentHelper::HasArgument(Def(), "broadcast")) { - SetDense(0, GO(0)); - return SingleGradientDef( - "Negative", "", vector{GO(0)}, vector{GI(1)}); - } else { - SetDense(0, GO(0)); - vector grad_ops; - grad_ops.push_back(CreateOperatorDef( - "Negative", - "", - vector{GO(0)}, - vector{GI(1) + "_autogen_pre_red"})); - - Argument axis, axis_str, order; - if (ArgumentHelper::HasArgument(Def(), "axis")) { - axis = GetArgument(Def(), "axis"); - } else { - axis = MakeArgument("axis", -1); - } - if (ArgumentHelper::HasArgument(Def(), "axis_str")) { - axis_str = GetArgument(Def(), "axis_str"); - } else { - axis_str = MakeArgument("axis_str", ""); - } - if (ArgumentHelper::HasArgument(Def(), "order")) { - order = GetArgument(Def(), "order"); - } else { - order = MakeArgument("order", "NCHW"); - } - grad_ops.push_back(CreateOperatorDef( - "SumReduceLike", - "", - vector{GI(1) + "_autogen_pre_red", I(1)}, - vector{GI(1)}, - vector{axis, axis_str, order})); - - return grad_ops; - } - } - // Make sure the broadcast argument is not copied over. - bool CopyArguments() const override { - return false; - } -}; -REGISTER_GRADIENT(Sub, GetSubGradient); - -class GetMulGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - CAFFE_ENFORCE( - Def().input(0) != Def().output(0) && Def().input(1) != Def().output(0), - "Gradient computation cannot be carried out if Mul uses in-place " - "computation: ", - ProtoDebugString(Def())); - if (!ArgumentHelper::HasArgument(Def(), "broadcast")) { - return vector{ - CreateOperatorDef( - "Mul", "", vector{GO(0), I(1)}, vector{GI(0)}), - CreateOperatorDef( - "Mul", "", vector{GO(0), I(0)}, vector{GI(1)})}; - } else { - Argument broadcast, axis, axis_str, order; - if (ArgumentHelper::HasArgument(Def(), "broadcast")) { - broadcast = GetArgument(Def(), "broadcast"); - } else { - broadcast = MakeArgument("broadcast", 0); - } - if (ArgumentHelper::HasArgument(Def(), "axis")) { - axis = GetArgument(Def(), "axis"); - } else { - axis = MakeArgument("axis", -1); - } - if (ArgumentHelper::HasArgument(Def(), "axis_str")) { - axis_str = GetArgument(Def(), "axis_str"); - } else { - axis_str = MakeArgument("axis_str", ""); - } - if (ArgumentHelper::HasArgument(Def(), "order")) { - order = GetArgument(Def(), "order"); - } else { - order = MakeArgument("order", "NCHW"); - } - - vector grad_ops; - grad_ops.push_back(CreateOperatorDef( - "Mul", - "mul_grad_1st_op", - vector{GO(0), I(1)}, - vector{GI(0)}, - vector{broadcast, axis, axis_str, order})); - grad_ops.push_back(CreateOperatorDef( - "Mul", - "mul_gradient_2nd_op", - vector{GO(0), I(0)}, - vector{GI(1) + "_autogen_pre_red"})); - - grad_ops.push_back(CreateOperatorDef( - "SumReduceLike", - "mul_with_broadcast_grad_3", - vector{GI(1) + "_autogen_pre_red", I(1)}, - vector{GI(1)}, - vector{axis, axis_str, order})); - - return grad_ops; - } - } - - // Make sure the broadcast argument is not copied over. 
- bool CopyArguments() const override { - return false; - } -}; -REGISTER_GRADIENT(Mul, GetMulGradient); - -class GetDivGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - CAFFE_ENFORCE( - !ArgumentHelper::HasArgument(Def(), "broadcast"), - "Gradient not ready yet for Div with broadcasting."); - return SingleGradientDef( - "DivGradient", - "", - vector{I(1), O(0), GO(0)}, - vector{GI(0), GI(1)}); - } -}; -REGISTER_GRADIENT(Div, GetDivGradient); - std::function ComparisonDocGenerator( const char* name, const char* desc) { @@ -324,11 +187,12 @@ Performs element-wise {desc} comparison `{name}` (with limited broadcast support .FillUsing(ComparisonDocGenerator(symbol, desc)); \ SHOULD_NOT_DO_GRADIENT(name) +CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(EQ, "==", "equal to"); +CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(NE, "!=", "not equal to"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(LT, "<", "less than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(LE, "<=", "less or equal than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(GT, ">", "greater than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(GE, ">=", "greater or equal than"); -CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(EQ, "==", "equality"); std::function LogicalDocGenerator(const char* name) { return [=](OpSchema& schema) { @@ -354,17 +218,56 @@ Both input operands should be of type `bool`. } #define CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(name, symbol, onnx_schema) \ + OPERATOR_SCHEMA(name) \ + .NumInputs(2) \ + .NumOutputs(1) \ + .AllowInplace({{0, 0}}) \ + .FillUsing(LogicalDocGenerator(symbol)) \ + .InheritOnnxSchema(onnx_schema); \ + SHOULD_NOT_DO_GRADIENT(name) + +CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Or, "or", "Or"); +CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(And, "and", "And"); +CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Xor, "xor", "Xor"); + +#undef CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP + +std::function BitwiseDocGenerator(const char* name) { + return [=](OpSchema& schema) { + string doc = R"DOC( +Performs element-wise bitwise operation `{name}` (with limited broadcast support). +Both input operands should be of type `bool`. +{broadcast_doc})DOC"; + ReplaceAll(doc, "{name}", name); + ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc); + schema.SetDoc(doc); + schema.Arg("broadcast", "Pass 1 to enable broadcasting"); + schema.Arg( + "axis", + "If set, defines the broadcast dimensions. See doc for details."); + schema.Input(0, "A", "First operand."); + schema.Input( + 1, + "B", + "Second operand. With broadcasting can be of smaller size than A. 
" + "If broadcasting is disabled it should be of the same size."); + schema.Output(0, "C", "Result, has same dimensions and type with A."); + }; +} + +#define CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(name, symbol) \ OPERATOR_SCHEMA(name) \ .NumInputs(2) \ .NumOutputs(1) \ .AllowInplace({{0, 0}}) \ - .FillUsing(LogicalDocGenerator(symbol)) \ - .InheritOnnxSchema(onnx_schema); \ + .FillUsing(LogicalDocGenerator(symbol)); \ SHOULD_NOT_DO_GRADIENT(name) -CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Or, "or", "Or"); -CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(And, "and", "And"); -CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Xor, "xor", "Xor"); +CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseOr, "bitwise_or"); +CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseAnd, "bitwise_and"); +CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseXor, "bitwise_xor"); + +#undef CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP OPERATOR_SCHEMA(Not) .NumInputs(1) @@ -375,4 +278,12 @@ OPERATOR_SCHEMA(Not) .InheritOnnxSchema("Not"); SHOULD_NOT_DO_GRADIENT(Not); +OPERATOR_SCHEMA(Sign) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC(Performs element-wise sign.)DOC") + .Input(0, "X", "Input tensor.") + .Output(0, "Y", "Output tensor."); +SHOULD_NOT_DO_GRADIENT(Sign); + } // namespace caffe2 diff --git a/caffe2/operators/elementwise_sub_op.cc b/caffe2/operators/elementwise_sub_op.cc index 5cfa15cf75d5d2..b1de78e66cb37a 100644 --- a/caffe2/operators/elementwise_sub_op.cc +++ b/caffe2/operators/elementwise_sub_op.cc @@ -1,10 +1,33 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_sub_op.h" namespace caffe2 { -// See the operations supported here: -// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html -#define EIGEN_SUB(x, y) ((x) - (y)) -EIGEN_FUNCTOR(Sub, EIGEN_SUB, NumericTypes, SameTypeAsInput); -#undef EIGEN_SUB -} +REGISTER_CPU_OPERATOR( + Sub, + BinaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + SubGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CPUContext, + SubFunctor>); + +namespace { + +class GetSubGradient final : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "SubGradient", + "", + std::vector{GO(0), I(0), I(1)}, + std::vector{GI(0), GI(1)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(Sub, GetSubGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_sub_op.h b/caffe2/operators/elementwise_sub_op.h new file mode 100644 index 00000000000000..20d38bfff58c8a --- /dev/null +++ b/caffe2/operators/elementwise_sub_op.h @@ -0,0 +1,76 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_SUB_OP_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_SUB_OP_H_ + +#include +#include +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SubFunctor { + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A, + const TIn* B, + TOut* C, + Context* context) const { + math::Sub( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A, + B, + C, + context); + return true; + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* /* A */, + const TIn* /* B */, + const TOut* /* C */, + TGrad* dA, + TGrad* dB, + Context* context) const { + const std::vector C_dims = + ComputeBinaryBroadcastForwardDims(A_dims, B_dims); + std::vector A_axes; + std::vector B_axes; + ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes); + 
math::ReduceSum( + C_dims.size(), + C_dims.data(), + A_axes.size(), + A_axes.data(), + dC, + dA, + context); + math::ReduceSum( + C_dims.size(), + C_dims.data(), + B_axes.size(), + B_axes.data(), + dC, + dB, + context); + const int size = std::accumulate( + B_dims.cbegin(), B_dims.cend(), 1, std::multiplies()); + math::Neg(size, dB, dB, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_SUB_OP_H_ diff --git a/caffe2/operators/elementwise_sub_op_gpu.cc b/caffe2/operators/elementwise_sub_op_gpu.cc new file mode 100644 index 00000000000000..72a0f115b18204 --- /dev/null +++ b/caffe2/operators/elementwise_sub_op_gpu.cc @@ -0,0 +1,17 @@ +#include "caffe2/operators/elementwise_sub_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Sub, + BinaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + SubGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CUDAContext, + SubFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/exp_op.cc b/caffe2/operators/exp_op.cc index d02f4b6b1f684e..d4b0b7b97a6ee8 100644 --- a/caffe2/operators/exp_op.cc +++ b/caffe2/operators/exp_op.cc @@ -1,19 +1,13 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/exp_op.h" -namespace caffe2 { +#include +#include -struct ExpCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Exp(n, x, y, device_context); - } -}; +namespace caffe2 { REGISTER_CPU_OPERATOR( Exp, - UnaryElementwiseOp, CPUContext, ExpCPUFunctor>); + UnaryElementwiseOp, CPUContext, ExpFunctor>); OPERATOR_SCHEMA(Exp) .NumInputs(1) @@ -33,15 +27,21 @@ and output blobs. "element-wise") .InheritOnnxSchema("Exp"); +namespace { + class GetExpGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "Mul", "", - std::vector{O(0), GO(0)}, - std::vector{GI(0)}); + std::vector{O(0), GO(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Exp, GetExpGradient); + } // namespace caffe2 diff --git a/caffe2/operators/exp_op.cu b/caffe2/operators/exp_op.cu deleted file mode 100644 index c31b5441d7a45c..00000000000000 --- a/caffe2/operators/exp_op.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include - -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" - -namespace caffe2 { - -template -__global__ void ExpKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = __expf(X[i]); - } -} - -struct ExpCUDAFunctor { - template - inline void operator()(const int n, const T* x, - T* y, CUDAContext* device_context) { - ExpKernel<<cuda_stream()>>>(n, x, y); - return; - } - inline bool InplaceAllowed() { - return true; - } -}; - -REGISTER_CUDA_OPERATOR( - Exp, UnaryElementwiseOp, CUDAContext, ExpCUDAFunctor>); -} // namespace caffe2 diff --git a/caffe2/operators/exp_op.h b/caffe2/operators/exp_op.h new file mode 100644 index 00000000000000..051827bcb5548a --- /dev/null +++ b/caffe2/operators/exp_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_EXP_OP_ +#define CAFFE2_OPERATORS_EXP_OP_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct ExpFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Exp(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + 
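The rewritten GetExpGradient above relies on the identity d/dx exp(x) = exp(x), so dX is produced by a single Mul of the forward output with the incoming gradient. A quick finite-difference check of that identity, as a standalone standard-C++ sketch rather than part of the patch:

#include <cmath>
#include <cstdio>

int main() {
  const double x = 0.7, dy = 1.3, eps = 1e-6;
  const double y = std::exp(x);
  // What the gradient maker computes: Mul(O(0), GO(0)) -> Y * dY.
  const double analytic = y * dy;
  // Central finite difference of exp(x), scaled by the upstream dY.
  const double numeric =
      dy * (std::exp(x + eps) - std::exp(x - eps)) / (2.0 * eps);
  std::printf("analytic=%.6f numeric=%.6f\n", analytic, numeric);
  return 0;
}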
+#endif // CAFFE2_OPERATORS_EXP_OP_ diff --git a/caffe2/operators/exp_op_gpu.cc b/caffe2/operators/exp_op_gpu.cc new file mode 100644 index 00000000000000..0b79a61430784a --- /dev/null +++ b/caffe2/operators/exp_op_gpu.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/exp_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Exp, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + ExpFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/group_norm_op.cc b/caffe2/operators/group_norm_op.cc index 59de7940d3341f..38db78ceb7866f 100644 --- a/caffe2/operators/group_norm_op.cc +++ b/caffe2/operators/group_norm_op.cc @@ -38,7 +38,7 @@ void GroupNormForward( const int i_mu = index[0] * dims[kGDim] + index[kGDim]; const int i_gamma = index[kGDim] * dims[kDDim] + index[kDDim]; Y[i] = gamma[i_gamma] * (X[i] - mu[i_mu]) * rsig[i_mu] + beta[i_gamma]; - math::internal::IncreaseIndexInDims(4, dims.data(), index.data()); + math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); } } @@ -59,7 +59,7 @@ void ComputeInternalGradients( const int i_gamma = index[kGDim] * dims[kDDim] + index[kDDim]; ds[i_mu] += gamma[i_gamma] * dY[i] * X[i]; db[i_mu] += gamma[i_gamma] * dY[i]; - math::internal::IncreaseIndexInDims(4, dims.data(), index.data()); + math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); } } @@ -102,7 +102,7 @@ void GroupNormBackward( dX[i] = gamma[i_gamma] * dY[i] * rsig[i_mu] + (u - v) * denom; dgamma[i_gamma] += dY[i] * (X[i] - mu[i_mu]) * rsig[i_mu]; dbeta[i_gamma] += dY[i]; - math::internal::IncreaseIndexInDims(4, dims.data(), index.data()); + math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); } } diff --git a/caffe2/operators/group_norm_op.cu b/caffe2/operators/group_norm_op.cu index 20e61c1f31cc2b..116b963b74ae0c 100644 --- a/caffe2/operators/group_norm_op.cu +++ b/caffe2/operators/group_norm_op.cu @@ -23,7 +23,7 @@ template using BlockReduce = cub::BlockReduce; template -inline __device__ T CubeCUDA(const T x) { +inline __host__ __device__ T CubeCUDA(const T x) { return x * x * x; } diff --git a/caffe2/operators/log_op.cc b/caffe2/operators/log_op.cc index 1aa2bc4791f5c8..68e95e6f25ffb3 100644 --- a/caffe2/operators/log_op.cc +++ b/caffe2/operators/log_op.cc @@ -1,20 +1,13 @@ -#include "caffe2/operators/math_ops.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/log_op.h" +#include +#include namespace caffe2 { -struct LogCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Log(n, x, y, device_context); - } -}; - REGISTER_CPU_OPERATOR( Log, - UnaryElementwiseOp, CPUContext, LogCPUFunctor>); + UnaryElementwiseOp, CPUContext, LogFunctor>); OPERATOR_SCHEMA(Log) .NumInputs(1) @@ -34,16 +27,21 @@ and output blobs. 
"element-wise") .InheritOnnxSchema("Log"); +namespace { + class GetLogGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "Div", "", - std::vector{GO(0), I(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Log, GetLogGradient); } // namespace caffe2 diff --git a/caffe2/operators/log_op.cu b/caffe2/operators/log_op.cu deleted file mode 100644 index 3fb422cd7a3032..00000000000000 --- a/caffe2/operators/log_op.cu +++ /dev/null @@ -1,17 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/math_ops.h" - -namespace caffe2 { - -struct LogCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - math::Log(n, x, y, device_context); - } -}; - -REGISTER_CUDA_OPERATOR( - Log, - UnaryElementwiseOp, CUDAContext, LogCUDAFunctor>); -} diff --git a/caffe2/operators/log_op.h b/caffe2/operators/log_op.h new file mode 100644 index 00000000000000..7c420c11d6004c --- /dev/null +++ b/caffe2/operators/log_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_LOG_OP_H_ +#define CAFFE2_OPERATORS_LOG_OP_H_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct LogFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Log(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_LOG_OP_H_ diff --git a/caffe2/operators/log_op_gpu.cc b/caffe2/operators/log_op_gpu.cc new file mode 100644 index 00000000000000..dbf920d8afc143 --- /dev/null +++ b/caffe2/operators/log_op_gpu.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/log_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Log, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + LogFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/logit_op.cc b/caffe2/operators/logit_op.cc index e9eff0feb50300..8d1859a405a49d 100644 --- a/caffe2/operators/logit_op.cc +++ b/caffe2/operators/logit_op.cc @@ -1,28 +1,23 @@ #include "caffe2/operators/logit_op.h" -#include "caffe2/operators/elementwise_op.h" -namespace caffe2 { -struct LogitCPUFunctor { - explicit LogitCPUFunctor(OperatorBase& op) - : eps_(op.GetSingleArgument("eps", 1e-6f)) { - CAFFE_ENFORCE_GT(eps_, 0.0); - CAFFE_ENFORCE_LT(eps_, 0.5); - } - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /* unused */) { - ConstEigenArrayMap X(x, n, 1); - EigenArrayMap Y(y, n, 1); - const T k_one = 1.0; +#include +#include - Y = X.min(k_one - eps_); - Y = Y.max(eps_); - Y = (Y / (k_one - Y)).log(); - } +#include "caffe2/operators/elementwise_ops.h" - private: - float eps_; -}; +namespace caffe2 { + +template <> +template +bool LogitFunctor:: +operator()(const int size, const T* X, T* Y, CPUContext* /* context */) const { + ConstEigenVectorMap X_vec(X, size); + EigenVectorMap Y_vec(Y, size); + Y_vec = X_vec.array().min(static_cast(1.0f - eps_)); + Y_vec = Y_vec.array().max(eps_); + Y_vec = (Y_vec.array() / (T(1) - Y_vec.array())).log(); + return true; +} template <> bool LogitGradientOp::RunOnDevice() { @@ -47,7 +42,7 @@ REGISTER_CPU_OPERATOR( UnaryElementwiseWithArgsOp< TensorTypes, CPUContext, - LogitCPUFunctor>); + LogitFunctor>); REGISTER_CPU_OPERATOR(LogitGradient, LogitGradientOp); @@ -72,16 +67,21 @@ 
OPERATOR_SCHEMA(LogitGradient) .Output(0, "dX", "output float tensor") .Arg("eps", "small positive epsilon value, the default is 1e-6."); +namespace { + class GetLogitGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; vector GetGradientDefs() override { return vector{CreateOperatorDef( "LogitGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)})}; + std::vector{I(0), GO(0)}, + std::vector{GI(0)})}; } }; +} // namespace + REGISTER_GRADIENT(Logit, GetLogitGradient); + } // namespace caffe2 diff --git a/caffe2/operators/logit_op.cu b/caffe2/operators/logit_op.cu index fd419e15fe5764..d2e8351fcac950 100644 --- a/caffe2/operators/logit_op.cu +++ b/caffe2/operators/logit_op.cu @@ -1,49 +1,46 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" #include "caffe2/operators/logit_op.h" +#include "caffe2/core/context_gpu.h" + namespace caffe2 { +namespace { + template __global__ void LogitKernel(const int N, const T* X, const float eps, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = fminf(X[i], (1.0 - eps)); + Y[i] = fminf(X[i], (T(1) - eps)); Y[i] = fmaxf(Y[i], eps); - Y[i] = logf(Y[i] / (1.0 - Y[i])); + Y[i] = logf(Y[i] / (T(1) - Y[i])); } } +template __global__ void LogitGradientKernel( const int N, - const float* X, - const float* dY, + const T* X, + const T* dY, const float eps, - float* dX) { + T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = (X[i] < eps || X[i] > 1.0 - eps) ? 0 : (dY[i] / X[i] / (1 - X[i])); + dX[i] = (X[i] < eps || X[i] > T(1) - eps) ? T(0) + : (dY[i] / X[i] / (T(1) - X[i])); } } -struct LogitCUDAFunctor { - explicit LogitCUDAFunctor(OperatorBase& op) - : eps_(op.GetSingleArgument("eps", 1e-6)) { - CAFFE_ENFORCE_GT(eps_, 0.0); - CAFFE_ENFORCE_LT(eps_, 0.5); - } - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - LogitKernel - <<cuda_stream()>>>(n, x, eps_, y); - return; - } +} // namespace - private: - float eps_; -}; +template <> +template +bool LogitFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + LogitKernel + <<cuda_stream()>>>(N, X, eps_, Y); + return true; +} template <> bool LogitGradientOp::RunOnDevice() { @@ -66,6 +63,7 @@ REGISTER_CUDA_OPERATOR( UnaryElementwiseWithArgsOp< TensorTypes, CUDAContext, - LogitCUDAFunctor>); + LogitFunctor>); REGISTER_CUDA_OPERATOR(LogitGradient, LogitGradientOp); + } // namespace caffe2 diff --git a/caffe2/operators/logit_op.h b/caffe2/operators/logit_op.h index 09a6620bbf4477..1e460baec894e8 100644 --- a/caffe2/operators/logit_op.h +++ b/caffe2/operators/logit_op.h @@ -1,9 +1,26 @@ +#ifndef CAFFE2_OPERATORS_LOGIT_OP_H_ +#define CAFFE2_OPERATORS_LOGIT_OP_H_ + #include "caffe2/core/context.h" #include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/elementwise_ops.h" namespace caffe2 { +template +struct LogitFunctor { + explicit LogitFunctor(OperatorBase& op) + : eps_(op.GetSingleArgument("eps", 1e-6f)) { + CAFFE_ENFORCE_GT(eps_, 0.0); + CAFFE_ENFORCE_LT(eps_, 0.5); + } + + template + bool operator()(const int size, const T* X, T* Y, Context* context) const; + + const float eps_; +}; + template class LogitGradientOp final : public Operator { public: @@ -20,3 +37,5 @@ class LogitGradientOp final : public Operator { }; } // namespace caffe2 + +#endif // CAFFE2_OPERATORS_LOGIT_OP_H_ diff --git a/caffe2/operators/math_ops.cc b/caffe2/operators/math_ops.cc deleted file mode 100644 index 72fe0eb59a5346..00000000000000 --- a/caffe2/operators/math_ops.cc 
+++ /dev/null @@ -1,69 +0,0 @@ -#include "caffe2/operators/math_ops.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -struct SqrCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Sqr(n, x, y, device_context); - } -}; - -REGISTER_CPU_OPERATOR( - Sqr, - UnaryElementwiseOp, CPUContext, SqrCPUFunctor>); - -OPERATOR_SCHEMA(Sqr) - .NumInputs(1) - .NumOutputs(1) - .AllowInplace({{0, 0}}) - .IdenticalTypeAndShape() - .SetDoc("Square (x^2) the elements of the input") - .Input(0, "input", "Input tensor") - .Output(0, "output", "Squared elements of the input"); - -class GetSqrGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - Argument scale_arg; - scale_arg.set_name("scale"); - scale_arg.set_f(2.0); - return vector{CreateOperatorDef( - "Scale", - "", - std::vector{GO(0)}, - std::vector{GO(0)}, - std::vector{scale_arg}), - CreateOperatorDef( - "Mul", - "", - std::vector{GO(0), I(0)}, - std::vector{GI(0)})}; - } -}; -REGISTER_GRADIENT(Sqr, GetSqrGradient); - -struct SignCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - for (int i = 0; i < n; ++i) { - y[i] = (-T(1) * (x[i] < 0)) + (x[i] > 0); - } - } -}; - -REGISTER_CPU_OPERATOR( - Sign, - UnaryElementwiseOp, CPUContext, SignCPUFunctor>); - -OPERATOR_SCHEMA(Sign) - .NumInputs(1) - .NumOutputs(1) - .SetDoc("Computes sign for each element of the input: -1, 0 or 1.") - .IdenticalTypeAndShape(); -SHOULD_NOT_DO_GRADIENT(Sign); - -} // namespace caffe2 diff --git a/caffe2/operators/math_ops.cu b/caffe2/operators/math_ops.cu deleted file mode 100644 index 9cd6ad64ac66bc..00000000000000 --- a/caffe2/operators/math_ops.cu +++ /dev/null @@ -1,39 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/math_ops.h" - -namespace caffe2 { - -struct SqrCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - math::Sqr(n, x, y, device_context); - } -}; - -template -__global__ void SignKernel(int n, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { - y[i] = (-T(1) * (x[i] < 0)) + (x[i] > 0); - } -} - -struct SignCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SignKernel<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - device_context->cuda_stream()>>>(n, x, y); - } -}; - -REGISTER_CUDA_OPERATOR( - Sqr, - UnaryElementwiseOp, CUDAContext, SqrCUDAFunctor>); -REGISTER_CUDA_OPERATOR( - Sign, - UnaryElementwiseOp, CUDAContext, SignCUDAFunctor>); -} diff --git a/caffe2/operators/math_ops.h b/caffe2/operators/math_ops.h deleted file mode 100644 index b411b8ed6920ea..00000000000000 --- a/caffe2/operators/math_ops.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef CAFFE2_OPERATORS_MATH_OP_H_ -#define CAFFE2_OPERATORS_MATH_OP_H_ - -#include "caffe2/core/common_omp.h" -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/core/tensor.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" - -#endif diff --git a/caffe2/operators/moments_op.cc b/caffe2/operators/moments_op.cc index 930339e737203f..49ecdf7d577c53 100644 --- a/caffe2/operators/moments_op.cc +++ b/caffe2/operators/moments_op.cc @@ -23,13 +23,13 @@ bool MomentsGradientOp::Compute( const T norm = static_cast(dY_size) / static_cast(dX_size); for (int dX_index = 0; dX_index < 
dX_size; ++dX_index) { const int dY_index = - math::internal::GetIndexFromDims(ndim, dY_dims.data(), index.data()); + math::utils::GetIndexFromDims(ndim, dY_dims.data(), index.data()); dX_data[dX_index] = (dmean_data[dY_index] + static_cast(2) * (X_data[dX_index] - mean_data[dY_index]) * dvariance_data[dY_index]) * norm; - math::internal::IncreaseIndexInDims(ndim, dX_dims.data(), index.data()); + math::utils::IncreaseIndexInDims(ndim, dX_dims.data(), index.data()); } return true; } diff --git a/caffe2/operators/negative_op.cc b/caffe2/operators/negative_op.cc index 239cd972d0869b..6f2b054bd2963d 100644 --- a/caffe2/operators/negative_op.cc +++ b/caffe2/operators/negative_op.cc @@ -1,21 +1,13 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/negative_op.h" -namespace caffe2 { +#include +#include -struct NegativeCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - EigenVectorMap(y, n) = -ConstEigenVectorMap(x, n); - // for (int i = 0; i < n; ++i) { - // y[i] = -x[i]; - //} - } -}; +namespace caffe2 { REGISTER_CPU_OPERATOR( - Negative, UnaryElementwiseOp< - TensorTypes, CPUContext, NegativeCPUFunctor>); + Negative, + UnaryElementwiseOp>); // Input: X, output: Y OPERATOR_SCHEMA(Negative) @@ -30,14 +22,21 @@ Computes the element-wise negative of the input. .Output(0, "Y", "1D input tensor") .InheritOnnxSchema("Neg"); +namespace { + class GetNegativeGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( - "Negative", "", - vector{GO(0)}, - vector{GI(0)}); + "Negative", + "", + std::vector{GO(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Negative, GetNegativeGradient); -} // namespace caffe2 + +} // namespace caffe2 diff --git a/caffe2/operators/negative_op.cu b/caffe2/operators/negative_op.cu deleted file mode 100644 index 1bd5cea474ae46..00000000000000 --- a/caffe2/operators/negative_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" - -namespace caffe2 { - -template -__global__ void NegativeKernel(const int N, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = -x[i]; - } -} - -struct NegativeCUDAFunctor { - template - inline void operator()(const int n, const T* x, - T* y, CUDAContext* device_context) { - NegativeKernel<<cuda_stream()>>>(n, x, y); - return; - } -}; - -REGISTER_CUDA_OPERATOR( - Negative, UnaryElementwiseOp< - TensorTypes, CUDAContext, - NegativeCUDAFunctor>); -} // namespace caffe2 diff --git a/caffe2/operators/negative_op.h b/caffe2/operators/negative_op.h new file mode 100644 index 00000000000000..876a83732307bd --- /dev/null +++ b/caffe2/operators/negative_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_NEGATIVE_OP_H_ +#define CAFFE2_OPERATORS_NEGATIVE_OP_H_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct NegativeFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Neg(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_NEGATIVE_OP_H_ diff --git a/caffe2/operators/negative_op_gpu.cc b/caffe2/operators/negative_op_gpu.cc new file mode 100644 index 00000000000000..dffc947eff6755 --- /dev/null +++ b/caffe2/operators/negative_op_gpu.cc @@ -0,0 +1,14 @@ +#include 
"caffe2/operators/negative_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Negative, + UnaryElementwiseOp< + NumericTypes, + CUDAContext, + NegativeFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/pow_op.h b/caffe2/operators/pow_op.h index 054ed7278ab380..475499744c07f6 100644 --- a/caffe2/operators/pow_op.h +++ b/caffe2/operators/pow_op.h @@ -7,7 +7,7 @@ #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" // definition of NumericTypes and SameTypeAsInput is in below header file -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_ops.h" namespace caffe2 { @@ -103,7 +103,7 @@ class PowOp : public Operator { A.size(), Adata, Bdata, 0, Cdata, &context_); } else { size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(A, B, axis_); + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); if (post == 1) { functor_.template RunWithBroadcast( Adata, Bdata, Cdata, pre, n, &context_); diff --git a/caffe2/operators/reduce_ops.cc b/caffe2/operators/reduce_ops.cc index fe1a5b5f3f9bcf..aa635118dbdbbc 100644 --- a/caffe2/operators/reduce_ops.cc +++ b/caffe2/operators/reduce_ops.cc @@ -24,10 +24,10 @@ void ComputeReduceMinMaxGradient( std::vector index(ndim, 0); for (int dX_index = 0; dX_index < dX_size; ++dX_index) { const int dY_index = - math::internal::GetIndexFromDims(ndim, dY_dims.data(), index.data()); + math::utils::GetIndexFromDims(ndim, dY_dims.data(), index.data()); dX_data[dX_index] = Y_data[dY_index] == X_data[dX_index] ? dY_data[dY_index] : T(0); - math::internal::IncreaseIndexInDims(ndim, dX_dims.data(), index.data()); + math::utils::IncreaseIndexInDims(ndim, dX_dims.data(), index.data()); } } diff --git a/caffe2/operators/reduce_ops.h b/caffe2/operators/reduce_ops.h index 16e53193b1bc4a..2112ba797e566e 100644 --- a/caffe2/operators/reduce_ops.h +++ b/caffe2/operators/reduce_ops.h @@ -120,7 +120,7 @@ class ReduceGradientOp final : public Operator { private: std::vector axes_; - const Reducer reducer_{}; + Reducer reducer_{}; }; template diff --git a/caffe2/operators/rsqrt_op.cc b/caffe2/operators/rsqrt_op.cc index 16cea45de71e14..2180a10c010162 100644 --- a/caffe2/operators/rsqrt_op.cc +++ b/caffe2/operators/rsqrt_op.cc @@ -1,32 +1,35 @@ #include "caffe2/operators/rsqrt_op.h" +#include +#include #include -#include namespace caffe2 { template <> struct RSqrtFunctor { template - inline void operator()( - const int size, - const T* X, - T* Y, - CPUContext* /* context */) const { + bool operator()(const int size, const T* X, T* Y, CPUContext* /* context */) + const { EigenArrayMap(Y, 1, size) = ConstEigenArrayMap(X, 1, size).rsqrt(); + return true; } }; template <> template -void RSqrtGradientFunctor::Run( - const int size, +bool RSqrtGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, const T* dY, const T* Y, T* dX, CPUContext* /* context */) const { - EigenArrayMap(dX, 1, size) = ConstEigenArrayMap(dY, 1, size) * - ConstEigenArrayMap(Y, 1, size).cube() * static_cast(-0.5); + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + EigenVectorMap(dX, size) = ConstEigenVectorMap(dY, size).array() * + ConstEigenVectorMap(Y, size).array().cube() * static_cast(-0.5); + return true; } REGISTER_CPU_OPERATOR( @@ -40,7 +43,7 @@ REGISTER_CPU_OPERATOR( BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>>); + RSqrtGradientFunctor>); OPERATOR_SCHEMA(RSqrt) .NumInputs(1) 
diff --git a/caffe2/operators/rsqrt_op.cu b/caffe2/operators/rsqrt_op.cu index 3788abf70689a1..710b7c9d898376 100644 --- a/caffe2/operators/rsqrt_op.cu +++ b/caffe2/operators/rsqrt_op.cu @@ -1,5 +1,8 @@ #include "caffe2/operators/rsqrt_op.h" +#include +#include + #include "caffe2/core/context_gpu.h" namespace caffe2 { @@ -7,7 +10,7 @@ namespace caffe2 { namespace { template -inline __device__ T CubeCUDA(const T x) { +inline __host__ __device__ T CubeCUDA(const T x) { return x * x * x; } @@ -27,17 +30,21 @@ RSqrtGradientCUDAKernel(const int size, const T* dY, const T* Y, T* dX) { template <> template -void RSqrtGradientFunctor::Run( - const int size, +bool RSqrtGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, const T* dY, const T* Y, T* dX, CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); RSqrtGradientCUDAKernel <<cuda_stream()>>>(size, dY, Y, dX); + return true; } REGISTER_CUDA_OPERATOR( @@ -51,6 +58,6 @@ REGISTER_CUDA_OPERATOR( BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>>); + RSqrtGradientFunctor>); } // namespace caffe2 diff --git a/caffe2/operators/rsqrt_op.h b/caffe2/operators/rsqrt_op.h index 575854a66600ca..93073458845658 100644 --- a/caffe2/operators/rsqrt_op.h +++ b/caffe2/operators/rsqrt_op.h @@ -1,9 +1,9 @@ #ifndef CAFFE2_OPERATORS_RSQRT_OP_H_ #define CAFFE2_OPERATORS_RSQRT_OP_H_ -#include "caffe2/core/context.h" -#include "caffe2/core/operator.h" -#include "caffe2/operators/elementwise_op.h" +#include + +#include "caffe2/operators/elementwise_ops.h" #include "caffe2/utils/math.h" namespace caffe2 { @@ -11,17 +11,22 @@ namespace caffe2 { template struct RSqrtFunctor { template - inline void operator()(const int size, const T* X, T* Y, Context* context) - const { - math::InvSqrt(size, X, Y, context); + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::InvSqrt(N, X, Y, context); + return true; } }; template struct RSqrtGradientFunctor { template - inline void - Run(const int size, const T* dY, const T* Y, T* dX, Context* context) const; + bool Forward( + const std::vector& dY_dims, + const std::vector& Y_dims, + const T* dY, + const T* Y, + T* dX, + Context* context) const; }; } // namespace caffe2 diff --git a/caffe2/operators/shape_op.cc b/caffe2/operators/shape_op.cc index 4d19fe08c2c939..de36a2b1280799 100644 --- a/caffe2/operators/shape_op.cc +++ b/caffe2/operators/shape_op.cc @@ -22,7 +22,7 @@ OPERATOR_SCHEMA(Shape) } else { out[0].add_dims(axes.size()); } - out[0].set_data_type(TensorProto::INT32); + out[0].set_data_type(TensorProto::INT64); return out; }) .SetDoc(R"DOC( diff --git a/caffe2/operators/sigmoid_op.cc b/caffe2/operators/sigmoid_op.cc index d908f7eeb705db..ad86954d975a83 100644 --- a/caffe2/operators/sigmoid_op.cc +++ b/caffe2/operators/sigmoid_op.cc @@ -1,72 +1,90 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/sigmoid_op.h" + +#include +#include + #include "caffe2/utils/math.h" namespace caffe2 { -struct SigmoidCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap xM(x, n); - EigenVectorArrayMap(y, n) = 1. / (1. + (-xM).exp()); - } -}; +template <> +template +bool SigmoidFunctor:: +operator()(const int N, const T* X, T* Y, CPUContext* /* context */) const { + ConstEigenVectorArrayMap X_arr(X, N); + EigenVectorArrayMap(Y, N) = 1. / (1. 
+ (-X_arr).exp()); + return true; +} -struct SigmoidGradientCPUFunctor { - template - inline void Run( - const int n, - const T* y, - const T* dy, - T* dx, - CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap yM(y, n), dyM(dy, n); - EigenVectorArrayMap(dx, n) = dyM * yM * (1. - yM); - } -}; +template <> +template +bool SigmoidGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, + const T* dY, + const T* Y, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap Y_arr(Y, size); + EigenVectorArrayMap(dX, size) = dY_arr * Y_arr * (1. - Y_arr); + return true; +} REGISTER_CPU_OPERATOR( - Sigmoid, UnaryElementwiseOp< - TensorTypes, CPUContext, SigmoidCPUFunctor>); + Sigmoid, + UnaryElementwiseOp< + TensorTypes, + CPUContext, + SigmoidFunctor>); REGISTER_CPU_OPERATOR( SigmoidGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + SigmoidGradientFunctor>); // Input: X, output: Y OPERATOR_SCHEMA(Sigmoid) - .NumInputs(1) - .NumOutputs(1) - .AllowInplace({{0, 0}}) - .IdenticalTypeAndShape() - .SetDoc(R"DOC( + .NumInputs(1) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .IdenticalTypeAndShape() + .SetDoc(R"DOC( Sigmoid takes one input data (Tensor) and produces one output data (Tensor) where the sigmoid function, y = 1 / (1 + exp(-x)), is applied to the tensor elementwise. )DOC") - .Input(0, "X", "1D input tensor") - .Output(0, "Y", "1D output tensor") - .InheritOnnxSchema("Sigmoid"); + .Input(0, "X", "1D input tensor") + .Output(0, "Y", "1D output tensor") + .InheritOnnxSchema("Sigmoid"); // Input: Y, dY, output: dX OPERATOR_SCHEMA(SigmoidGradient) - .NumInputs(2) - .NumOutputs(1) - .AllowInplace({{1, 0}}) - .SetDoc(R"DOC( + .NumInputs(2) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .SetDoc(R"DOC( SigmoidGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the sigmoid function. )DOC"); +namespace { + class GetSigmoidGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( - "SigmoidGradient", "", - vector{O(0), GO(0)}, - vector{GI(0)}); + "SigmoidGradient", + "", + std::vector{GO(0), O(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Sigmoid, GetSigmoidGradient); -} // namespace caffe2 + +} // namespace caffe2 diff --git a/caffe2/operators/sigmoid_op.cu b/caffe2/operators/sigmoid_op.cu index cacf6eb05ca7e8..3fb51842795a10 100644 --- a/caffe2/operators/sigmoid_op.cu +++ b/caffe2/operators/sigmoid_op.cu @@ -1,50 +1,81 @@ -#include +#include "caffe2/operators/sigmoid_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { +namespace { + template -__global__ void SigmoidKernel(const int N, const T* x, T* y) { +__global__ void SigmoidKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = 1. / (1. + exp(-x[i])); +#if __CUDA_ARCH__ >= 350 + Y[i] = T(1) / (T(1) + exp(-__ldg(X + i))); +#else + Y[i] = T(1) / (T(1) + exp(-X[i])); +#endif } } template -__global__ void SigmoidGradientKernel(const int N, const T* y, const T* dy, - T* dx) { +__global__ void +SigmoidGradientKernel(const int N, const T* dY, const T* Y, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dx[i] = dy[i] * y[i] * (1. 
- y[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * __ldg(Y + i) * (T(1) - __ldg(Y + i)); +#else + dX[i] = dY[i] * Y[i] * (T(1) - Y[i]); +#endif } } -struct SigmoidCUDAFunctor { - template - inline void operator()(const int n, const T* x, - T* y, CUDAContext* device_context) { - SigmoidKernel<<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct SigmoidGradientCUDAFunctor { - template - inline void Run(const int n, const T* y, const T* dy, - T* dx, CUDAContext* device_context) { - SigmoidGradientKernel<<cuda_stream()>>>(n, y, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool SigmoidFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + SigmoidKernel + <<cuda_stream()>>>(N, X, Y); + return true; +} + +template <> +template +bool SigmoidGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, + const T* dY, + const T* Y, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + SigmoidGradientKernel + <<cuda_stream()>>>(size, dY, Y, dX); + return true; +} REGISTER_CUDA_OPERATOR( Sigmoid, - UnaryElementwiseOp, CUDAContext, SigmoidCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SigmoidFunctor>); REGISTER_CUDA_OPERATOR( - SigmoidGradient, BinaryElementwiseOp< - TensorTypes, CUDAContext, - WithoutBroadcast>); -} // namespace caffe2 + SigmoidGradient, + BinaryElementwiseOp< + TensorTypes, + CUDAContext, + SigmoidGradientFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/sigmoid_op.h b/caffe2/operators/sigmoid_op.h new file mode 100644 index 00000000000000..686fe12c4f6b38 --- /dev/null +++ b/caffe2/operators/sigmoid_op.h @@ -0,0 +1,30 @@ +#ifndef CAFFE2_OPERATORS_SIGMOID_OP_H_ +#define CAFFE2_OPERATORS_SIGMOID_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" + +namespace caffe2 { + +template +struct SigmoidFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const; +}; + +template +struct SigmoidGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& Y_dims, + const T* dY, + const T* Y, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SIGMOID_OP_H_ diff --git a/caffe2/operators/sin_op.cc b/caffe2/operators/sin_op.cc index 8aa9cf77ca6845..d64202c8689164 100644 --- a/caffe2/operators/sin_op.cc +++ b/caffe2/operators/sin_op.cc @@ -1,35 +1,36 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/sin_op.h" -namespace caffe2 { +#include +#include -struct SinCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Sin(n, x, y, device_context); - } -}; +namespace caffe2 { -struct SinGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = dyM * cos(xM); - } -}; +template <> +template +bool SinGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = dY_arr * 
X_arr.cos(); + return true; +} REGISTER_CPU_OPERATOR( Sin, - UnaryElementwiseOp, CPUContext, SinCPUFunctor>); + UnaryElementwiseOp, CPUContext, SinFunctor>); REGISTER_CPU_OPERATOR( SinGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + SinGradientFunctor>); OPERATOR_SCHEMA(Sin) .NumInputs(1) @@ -43,15 +44,21 @@ Calculates the sine of the given input tensor, element-wise. OPERATOR_SCHEMA(SinGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +namespace { + class GetSinGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "SinGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Sin, GetSinGradient); + } // namespace caffe2 diff --git a/caffe2/operators/sin_op.cu b/caffe2/operators/sin_op.cu index 59849dcd2930b7..fb984f827e3bd8 100644 --- a/caffe2/operators/sin_op.cu +++ b/caffe2/operators/sin_op.cu @@ -1,61 +1,58 @@ -#include +#include "caffe2/operators/sin_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void SinKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = sin(X[i]); - } -} +namespace { template -__global__ void SinGradientKernel(const int N, const T* X, const T* dY, T* dX) { +__global__ void +SinGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * cos(__ldg(X + i)); +#else dX[i] = dY[i] * cos(X[i]); +#endif } } -struct SinCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SinKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct SinGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - SinGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool SinGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + SinGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Sin, - UnaryElementwiseOp, CUDAContext, SinCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SinFunctor>); REGISTER_CUDA_OPERATOR( SinGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + SinGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/sin_op.h b/caffe2/operators/sin_op.h new file mode 100644 index 00000000000000..aa0aae9a049cd2 --- /dev/null +++ b/caffe2/operators/sin_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_SIN_OP_H_ +#define CAFFE2_OPERATORS_SIN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SinFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Sin(N, X, Y, context); + return true; + } +}; + +template +struct SinGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* 
dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SIN_OP_H_ diff --git a/caffe2/operators/softsign_op.cc b/caffe2/operators/softsign_op.cc index eee80ec834f7ec..e7c72309148e77 100644 --- a/caffe2/operators/softsign_op.cc +++ b/caffe2/operators/softsign_op.cc @@ -1,40 +1,49 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/softsign_op.h" -namespace caffe2 { +#include +#include -struct SoftsignCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap x_arr(x, n); - EigenVectorMap(y, n) = (1 + x_arr.abs()).inverse() * x_arr; - } -}; +namespace caffe2 { -struct SoftsignGradientCPUFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap dy_arr(dy, n); - ConstEigenVectorArrayMap x_arr(x, n); - EigenVectorMap(dx, n) = dy_arr * (1 + x_arr.abs()).pow(2).inverse(); - } -}; +template <> +template +bool SoftsignFunctor:: +operator()(const int N, const T* X, T* Y, CPUContext* /* context */) const { + ConstEigenVectorArrayMap X_arr(X, N); + EigenVectorMap(Y, N) = (T(1) + X_arr.abs()).inverse() * X_arr; + return true; +} + +template <> +template +bool SoftsignGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = + dY_arr * (T(1) + X_arr.abs()).square().inverse(); + return true; +} REGISTER_CPU_OPERATOR( Softsign, - UnaryElementwiseOp, CPUContext, SoftsignCPUFunctor>); + UnaryElementwiseOp< + TensorTypes, + CPUContext, + SoftsignFunctor>); REGISTER_CPU_OPERATOR( SoftsignGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + SoftsignGradientFunctor>); OPERATOR_SCHEMA(Softsign) .NumInputs(1) @@ -100,7 +109,7 @@ print("Y:\n", workspace.FetchBlob("Y")) OPERATOR_SCHEMA(SoftsignGradient) .NumInputs(2) .NumOutputs(1) - .AllowInplace({{1, 0}}) + .AllowInplace({{0, 0}}) .SetDoc(R"DOC( Calculates the softsign gradient (sgn(x)/(1+|x|)^2) of the given input tensor element-wise. @@ -113,9 +122,11 @@ element-wise. 
"The softsign gradient (sgn(x)/(1+|x|)^2) values of the input tensor " "computed element-wise"); +namespace { + class GetSoftsignGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { CAFFE_ENFORCE( I(0) != O(0), "Cannot compute softsign gradient " @@ -124,11 +135,13 @@ class GetSoftsignGradient : public GradientMakerBase { return SingleGradientDef( "SoftsignGradient", "", - vector{I(0), GO(0)}, - vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Softsign, GetSoftsignGradient); } // namespace caffe2 diff --git a/caffe2/operators/softsign_op.cu b/caffe2/operators/softsign_op.cu index 8fec50c62a6754..e7bb5048194d04 100644 --- a/caffe2/operators/softsign_op.cu +++ b/caffe2/operators/softsign_op.cu @@ -1,55 +1,86 @@ -#include +#include "caffe2/operators/softsign_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { +namespace { + template -__global__ void SoftsignKernel(const int N, const T* X, T* Y) { +inline __host__ __device__ T SquareCUDA(const T x) { + return x * x; +} + +template +__global__ void SoftsignCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = X[i] / (1 + abs(X[i])); +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + i) / (T(1) + abs(__ldg(X + i))); +#else + Y[i] = X[i] / (T(1) + abs(X[i])); +#endif } } template -__global__ void SoftsignGradientKernel(const int N, const T* x, const T* dy, - T* dx) { +__global__ void +SoftsignGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dx[i] = dy[i] / pow(1 + abs(x[i]), 2); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) / SquareCUDA(T(1) + abs(__ldg(X + i))); +#else + dX[i] = dY[i] / SquareCUDA(T(1) + abs(X[i])); +#endif } } -struct SoftsignCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SoftsignKernel<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - device_context->cuda_stream()>>>(n, x, y); - return; - } -}; - -struct SoftsignGradientCUDAFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CUDAContext* device_context) { - SoftsignGradientKernel<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - device_context->cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool SoftsignFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + SoftsignCUDAKernel + <<cuda_stream()>>>(N, X, Y); + return true; +} + +template <> +template +bool SoftsignGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + SoftsignGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Softsign, - UnaryElementwiseOp, CUDAContext, SoftsignCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SoftsignFunctor>); REGISTER_CUDA_OPERATOR( SoftsignGradient, - BinaryElementwiseOp, CUDAContext, WithoutBroadcast>); + BinaryElementwiseOp< + TensorTypes, + CUDAContext, + SoftsignGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/softsign_op.h b/caffe2/operators/softsign_op.h new file mode 100644 index 
00000000000000..e2ce15605e46dd --- /dev/null +++ b/caffe2/operators/softsign_op.h @@ -0,0 +1,31 @@ +#ifndef CAFFE2_OPERATORS_SOFTSIGN_OP_H_ +#define CAFFE2_OPERATORS_SOFTSIGN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SoftsignFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const; +}; + +template +struct SoftsignGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SOFTSIGN_OP_H_ diff --git a/caffe2/operators/sqr_op.cc b/caffe2/operators/sqr_op.cc new file mode 100644 index 00000000000000..732cf737538a2a --- /dev/null +++ b/caffe2/operators/sqr_op.cc @@ -0,0 +1,47 @@ +#include "caffe2/operators/sqr_op.h" + +#include +#include + +namespace caffe2 { + +REGISTER_CPU_OPERATOR( + Sqr, + UnaryElementwiseOp, CPUContext, SqrFunctor>); + +OPERATOR_SCHEMA(Sqr) + .NumInputs(1) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .IdenticalTypeAndShape() + .SetDoc("Square (x^2) the elements of the input") + .Input(0, "input", "Input tensor") + .Output(0, "output", "Squared elements of the input"); + +namespace { + +class GetSqrGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + std::vector GetGradientDefs() override { + Argument scale_arg; + scale_arg.set_name("scale"); + scale_arg.set_f(2.0); + return std::vector{CreateOperatorDef( + "Scale", + "", + std::vector{GO(0)}, + std::vector{GO(0)}, + std::vector{scale_arg}), + CreateOperatorDef( + "Mul", + "", + std::vector{GO(0), I(0)}, + std::vector{GI(0)})}; + } +}; + +} // namespace + +REGISTER_GRADIENT(Sqr, GetSqrGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/sqr_op.h b/caffe2/operators/sqr_op.h new file mode 100644 index 00000000000000..f5af6542b85f47 --- /dev/null +++ b/caffe2/operators/sqr_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_SQR_OP_H_ +#define CAFFE2_OPERATORS_SQR_OP_H_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SqrFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Sqr(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SQR_OP_H_ diff --git a/caffe2/operators/sqr_op_gpu.cc b/caffe2/operators/sqr_op_gpu.cc new file mode 100644 index 00000000000000..6f7ed8e5aa63f9 --- /dev/null +++ b/caffe2/operators/sqr_op_gpu.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/sqr_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Sqr, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SqrFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/sqrt_op.cc b/caffe2/operators/sqrt_op.cc index e3a7880720e86e..b9ceaaa33aa73c 100644 --- a/caffe2/operators/sqrt_op.cc +++ b/caffe2/operators/sqrt_op.cc @@ -1,19 +1,17 @@ -#include -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/sqrt_op.h" -namespace caffe2 { +#include +#include -struct SqrtCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - EigenVectorArrayMap(y, n) = ConstEigenVectorArrayMap(x, n).sqrt(); - } -}; +namespace caffe2 { REGISTER_CPU_OPERATOR( Sqrt, - UnaryElementwiseOp, CPUContext, SqrtCPUFunctor>); + UnaryElementwiseOp< + 
TensorTypes, + CPUContext, + SqrtFunctor>); + // Input: X, output: Y OPERATOR_SCHEMA(Sqrt) .NumInputs(1) @@ -26,24 +24,30 @@ Computes the element-wise sqrt of the input. .Input(0, "X", "ND input tensor") .Output(0, "Y", "ND input tensor"); +namespace { + class GetSqrtGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { Argument scale_arg; scale_arg.set_name("scale"); scale_arg.set_f(0.5); - return vector{CreateOperatorDef( - "Scale", - "", - std::vector{GO(0)}, - std::vector{GI(0)}, - std::vector{scale_arg}), - CreateOperatorDef( - "Div", - "", - std::vector{GI(0), O(0)}, - std::vector{GI(0)})}; + return std::vector{CreateOperatorDef( + "Scale", + "", + std::vector{GO(0)}, + std::vector{GI(0)}, + std::vector{scale_arg}), + CreateOperatorDef( + "Div", + "", + std::vector{GI(0), O(0)}, + std::vector{GI(0)})}; } }; + +} // namespace + REGISTER_GRADIENT(Sqrt, GetSqrtGradient); + } // namespace caffe2 diff --git a/caffe2/operators/sqrt_op.cu b/caffe2/operators/sqrt_op.cu deleted file mode 100644 index af45e7c00b1e7c..00000000000000 --- a/caffe2/operators/sqrt_op.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/operators/math_ops.h" - -namespace caffe2 { - -template -__global__ void SqrtKernel(const int N, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = sqrt(x[i]); - } -} - -struct SqrtCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SqrtKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -REGISTER_CUDA_OPERATOR( - Sqrt, - UnaryElementwiseOp, CUDAContext, SqrtCUDAFunctor>); -} // namespace caffe2 diff --git a/caffe2/operators/sqrt_op.h b/caffe2/operators/sqrt_op.h new file mode 100644 index 00000000000000..4200068b67801e --- /dev/null +++ b/caffe2/operators/sqrt_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_SQRT_OP_H_ +#define CAFFE2_OPERATORS_SQRT_OP_H_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SqrtFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Sqrt(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SQRT_OP_H_ diff --git a/caffe2/operators/sqrt_op_gpu.cc b/caffe2/operators/sqrt_op_gpu.cc new file mode 100644 index 00000000000000..7a4ae2d30de0dd --- /dev/null +++ b/caffe2/operators/sqrt_op_gpu.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/sqrt_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Sqrt, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SqrtFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/string_ops.h b/caffe2/operators/string_ops.h index 17011fe7376323..b6f670f86938b4 100644 --- a/caffe2/operators/string_ops.h +++ b/caffe2/operators/string_ops.h @@ -2,7 +2,7 @@ #define CAFFE2_OPERATORS_STRING_OPS_H_ #include "caffe2/core/operator.h" -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_ops.h" namespace caffe2 { @@ -19,11 +19,13 @@ struct ForEach { explicit ForEach(OperatorBase& op) : functor(op) {} template - void operator()(int n, const In* in, Out* out, Context* /*c*/) { + bool operator()(int n, const In* in, Out* out, Context* /*c*/) { for (int i = 0; i < n; ++i) { out[i] = functor(in[i]); } + return true; } + Functor 
functor; }; diff --git a/caffe2/operators/swish_op.cc b/caffe2/operators/swish_op.cc index 185553e481fb4c..cd8dfa7ea5d3f6 100644 --- a/caffe2/operators/swish_op.cc +++ b/caffe2/operators/swish_op.cc @@ -1,17 +1,21 @@ -#include "swish_op.h" +#include "caffe2/operators/swish_op.h" + +#include +#include + #include "caffe2/core/types.h" -#include "caffe2/operators/elementwise_op.h" #include "caffe2/utils/math.h" namespace caffe2 { -struct SwishCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap xM(x, n); - EigenVectorArrayMap(y, n) = xM / (1. + (-xM).exp()); - } -}; + +template <> +template +bool SwishFunctor:: +operator()(const int N, const T* X, T* Y, CPUContext* /* context */) const { + ConstEigenVectorArrayMap X_arr(X, N); + EigenVectorArrayMap(Y, N) = X_arr / (T(1) + (-X_arr).exp()); + return true; +} template <> template @@ -35,16 +39,16 @@ bool SwishGradientOp::DoRunWithType() { ConstEigenVectorArrayMap dYvec(dYdata, DYin.size()); // dx = dy * (y + sigmoid(x)*(1-y)) - dXvec = dYvec * (Yvec + (1. / (1. + (-Xvec).exp())) * (1. - Yvec)); + dXvec = dYvec * (Yvec + (T(1) / (T(1) + (-Xvec).exp())) * (T(1) - Yvec)); return true; } REGISTER_CPU_OPERATOR( Swish, UnaryElementwiseOp< - TensorTypes, + TensorTypes, CPUContext, - SwishCPUFunctor>); + SwishFunctor>); REGISTER_CPU_OPERATOR(SwishGradient, SwishGradientOp); // Input: X, output: Y @@ -69,5 +73,21 @@ SwishGradient takes X, Y and dY and uses this to update dX according to the chain rule and derivatives of the swish function. )DOC"); +namespace { + +class GetSwishGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + std::vector GetGradientDefs() override { + return SingleGradientDef( + "SwishGradient", + "", + std::vector{I(0), O(0), GO(0)}, + std::vector{GI(0)}); + } +}; + +} // namespace + REGISTER_GRADIENT(Swish, GetSwishGradient); + } // namespace caffe2 diff --git a/caffe2/operators/swish_op.cu b/caffe2/operators/swish_op.cu index 399f1b61a8abdf..d42ad3d86b2f17 100644 --- a/caffe2/operators/swish_op.cu +++ b/caffe2/operators/swish_op.cu @@ -1,38 +1,52 @@ -#include +#include "caffe2/operators/swish_op.h" #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/operators/swish_op.h" namespace caffe2 { +namespace { + template -__global__ void SwishKernel(const int N, const T* x, T* y) { +__global__ void SwishCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = x[i] / (1. + exp(-x[i])); +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + i) / (T(1) + exp(-__ldg(X + i))); +#else + Y[i] = X[i] / (T(1) + exp(-X[i])); +#endif } } template -__global__ void -SwishGradientKernel(const int N, const T* x, const T* y, const T* dy, T* dx) { +__global__ void SwishGradientCUDAKernel( + const int N, + const T* X, + const T* Y, + const T* dY, + T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dx[i] = dy[i] * (y[i] + (1. - y[i]) / (1. 
+ exp(-x[i]))); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * + (__ldg(Y + i) + (T(1) - __ldg(Y + i)) / (T(1) + exp(-__ldg(X + i)))); +#else + dX[i] = dY[i] * (Y[i] + (T(1) - Y[i]) / (T(1) + exp(-X[i]))); +#endif } } -struct SwishCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SwishKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; +} // namespace + +template <> +template +bool SwishFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + SwishCUDAKernel + <<cuda_stream()>>>(N, X, Y); + return true; +} template <> template @@ -50,7 +64,7 @@ bool SwishGradientOp::DoRunWithType() { const T* y = Yin.template data(); const T* dy = DYin.template data(); T* dx = DXout->template mutable_data(); - SwishGradientKernel + SwishGradientCUDAKernel <<, CUDAContext, - SwishCUDAFunctor>); + SwishFunctor>); REGISTER_CUDA_OPERATOR(SwishGradient, SwishGradientOp); + } // namespace caffe2 diff --git a/caffe2/operators/swish_op.h b/caffe2/operators/swish_op.h index 01e9683f97a2e5..c582691dab900b 100644 --- a/caffe2/operators/swish_op.h +++ b/caffe2/operators/swish_op.h @@ -1,9 +1,17 @@ -#pragma once +#ifndef CAFFE2_OPERATORS_SWISH_OP_H_ +#define CAFFE2_OPERATORS_SWISH_OP_H_ -#include "caffe2/core/operator.h" +#include "caffe2/operators/elementwise_ops.h" #include "caffe2/utils/math.h" namespace caffe2 { + +template +struct SwishFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const; +}; + template class SwishGradientOp final : public Operator { public: @@ -22,15 +30,6 @@ class SwishGradientOp final : public Operator { OUTPUT_TAGS(DX); }; -class GetSwishGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SwishGradient", - "", - vector{I(0), O(0), GO(0)}, - vector{GI(0)}); - } -}; - } // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SWISH_OP_H_ diff --git a/caffe2/operators/tan_op.cc b/caffe2/operators/tan_op.cc index 1101bed7d36b31..9cc26beddfe488 100644 --- a/caffe2/operators/tan_op.cc +++ b/caffe2/operators/tan_op.cc @@ -1,35 +1,36 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/tan_op.h" + +#include +#include namespace caffe2 { -struct TanCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Tan(n, x, y, device_context); - } -}; - -struct TanGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = dyM / square(cos(xM)); - } -}; +template <> +template +bool TanGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = dY_arr / X_arr.cos().square(); + return true; +} REGISTER_CPU_OPERATOR( Tan, - UnaryElementwiseOp, CPUContext, TanCPUFunctor>); + UnaryElementwiseOp, CPUContext, TanFunctor>); REGISTER_CPU_OPERATOR( TanGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + TanGradientFunctor>); OPERATOR_SCHEMA(Tan) .NumInputs(1) @@ -46,16 
+47,21 @@ Calculates the tangent of the given input tensor, element-wise. OPERATOR_SCHEMA(TanGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +namespace { + class GetTanGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "TanGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Tan, GetTanGradient); + } // namespace caffe2 diff --git a/caffe2/operators/tan_op.cu b/caffe2/operators/tan_op.cu index fe74918011a573..e66fecda9020d0 100644 --- a/caffe2/operators/tan_op.cu +++ b/caffe2/operators/tan_op.cu @@ -1,61 +1,59 @@ -#include +#include "caffe2/operators/tan_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { template -__global__ void TanKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = tan(X[i]); - } +inline __host__ __device__ T Square(const T& x) { + return x * x; } template -__global__ void TanGradientKernel(const int N, const T* X, const T* dY, T* dX) { +__global__ void +TanGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = dY[i] / pow(cos(X[i]), 2); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) / Square(cos(__ldg(X + i))); +#else + dX[i] = dY[i] / Square(cos(X[i])); +#endif } } -struct TanCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - TanKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct TanGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - TanGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +template <> +template +bool TanGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + TanGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Tan, - UnaryElementwiseOp, CUDAContext, TanCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + TanFunctor>); REGISTER_CUDA_OPERATOR( TanGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + TanGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/tan_op.h b/caffe2/operators/tan_op.h new file mode 100644 index 00000000000000..579a20d12f11cb --- /dev/null +++ b/caffe2/operators/tan_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_TAN_OP_H_ +#define CAFFE2_OPERATORS_TAN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct TanFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Tan(N, X, Y, context); + return true; + } +}; + +template +struct TanGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_TAN_OP_H_ diff --git a/caffe2/operators/tanh_op.cc b/caffe2/operators/tanh_op.cc index a19ec8e92ebd8d..c1b5f9261876f4 100644 --- 
a/caffe2/operators/tanh_op.cc +++ b/caffe2/operators/tanh_op.cc @@ -1,52 +1,63 @@ -#include +#include "caffe2/operators/tanh_op.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include +#include +#include namespace caffe2 { -struct TanhCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { +template <> +template <> +bool TanhFunctor::operator()( + const int N, + const float* X, + float* Y, + CPUContext* /* context */) const { #ifdef CAFFE2_USE_ACCELERATE - vvtanhf(y, x, &n); + vvtanhf(Y, X, &N); #else - ConstEigenVectorArrayMap x_arr(x, n); - EigenVectorMap(y, n) = 1 - 2 * ((x_arr * 2).exp() + 1).inverse(); + ConstEigenVectorArrayMap X_arr(X, N); + EigenVectorMap(Y, N) = 1 - 2 * ((X_arr * 2).exp() + 1).inverse(); #endif - } -}; - -struct TanhGradientCPUFunctor { - template - inline void Run( - const int n, - const T* y, - const T* dy, - T* dx, - CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap dy_arr(dy, n); - ConstEigenVectorArrayMap y_arr(y, n); - EigenVectorMap(dx, n) = dy_arr * (1 - y_arr * y_arr); - } -}; + return true; +} + +template <> +template +bool TanhGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, + const T* dY, + const T* Y, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap Y_arr(Y, size); + EigenVectorMap(dX, size) = dY_arr * (1 - Y_arr * Y_arr); + return true; +} REGISTER_CPU_OPERATOR( - Tanh, UnaryElementwiseOp, CPUContext, TanhCPUFunctor>); + Tanh, + UnaryElementwiseOp< + TensorTypes, + CPUContext, + TanhFunctor>); REGISTER_CPU_OPERATOR( TanhGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + TanhGradientFunctor>); OPERATOR_SCHEMA(Tanh) - .NumInputs(1) - .NumOutputs(1) - .AllowInplace({{0, 0}}) - .IdenticalTypeAndShape() - .SetDoc(R"DOC( + .NumInputs(1) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .IdenticalTypeAndShape() + .SetDoc(R"DOC( Calculates the hyperbolic tangent of the given input tensor element-wise. This operation can be done in an in-place fashion too, by providing the same input and output blobs. 
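The refactor above replaces the old per-device TanhCPUFunctor / TanhGradientCUDAFunctor structs with a single TanhFunctor<Context> / TanhGradientFunctor<Context> pair whose gradient is computed from the output Y alone (dX = dY * (1 - Y^2)). The standalone sketch below mirrors that interface outside of Caffe2 — FakeCPUContext and the *Sketch names are placeholders invented for illustration, not Caffe2 APIs — and checks the Y-based gradient against a central finite difference:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Minimal stand-in for the context argument; the real CPUContext and
// elementwise_ops.h types are not reproduced here.
struct FakeCPUContext {};

template <class Context>
struct TanhFunctorSketch {
  template <typename T>
  bool operator()(const int N, const T* X, T* Y, Context* /* context */) const {
    for (int i = 0; i < N; ++i) {
      Y[i] = std::tanh(X[i]);
    }
    return true;
  }
};

template <class Context>
struct TanhGradientFunctorSketch {
  // Mirrors the Forward(dY_dims, Y_dims, dY, Y, dX, context) shape of the
  // refactored gradient functors: the gradient is computed from Y alone.
  template <typename T>
  bool Forward(
      const std::vector<int>& dY_dims,
      const std::vector<int>& /* Y_dims */,
      const T* dY,
      const T* Y,
      T* dX,
      Context* /* context */) const {
    int size = 1;
    for (const int d : dY_dims) {
      size *= d;
    }
    for (int i = 0; i < size; ++i) {
      dX[i] = dY[i] * (T(1) - Y[i] * Y[i]);
    }
    return true;
  }
};

int main() {
  FakeCPUContext ctx;
  TanhFunctorSketch<FakeCPUContext> tanh_fn;
  TanhGradientFunctorSketch<FakeCPUContext> tanh_grad_fn;

  const std::vector<int> dims = {2, 3};
  const std::vector<float> X = {-2.0f, -0.5f, 0.0f, 0.25f, 1.0f, 3.0f};
  std::vector<float> Y(X.size()), dX(X.size());
  const std::vector<float> dY(X.size(), 1.0f);  // upstream gradient of ones

  tanh_fn(static_cast<int>(X.size()), X.data(), Y.data(), &ctx);
  tanh_grad_fn.Forward(dims, dims, dY.data(), Y.data(), dX.data(), &ctx);

  // Check dX = 1 - tanh(x)^2 against a central finite difference.
  const float eps = 1e-3f;
  for (std::size_t i = 0; i < X.size(); ++i) {
    const float fd =
        (std::tanh(X[i] + eps) - std::tanh(X[i] - eps)) / (2.0f * eps);
    assert(std::abs(dX[i] - fd) < 1e-3f);
  }
  return 0;
}

Using the output-only form of the gradient is what lets Sigmoid and Tanh keep their gradient operators as plain BinaryElementwiseOp registrations taking {GO(0), O(0)}, on both CPU and CUDA.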
@@ -99,20 +110,31 @@ print("X:\n", workspace.FetchBlob("X")) )DOC") - .Input(0, "input", "1-D input tensor") - .Output(0, "output", "The hyperbolic tangent values of the input tensor, computed element-wise") - .InheritOnnxSchema("Tanh"); + .Input(0, "input", "1-D input tensor") + .Output( + 0, + "output", + "The hyperbolic tangent values of the input tensor, computed " + "element-wise") + .InheritOnnxSchema("Tanh"); + +OPERATOR_SCHEMA(TanhGradient).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}}); -OPERATOR_SCHEMA(TanhGradient).NumInputs(2).NumOutputs(1).AllowInplace({{1, 0}}); +namespace { class GetTanhGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( - "TanhGradient", "", - std::vector{O(0), GO(0)}, - std::vector{GI(0)}); + "TanhGradient", + "", + std::vector{GO(0), O(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Tanh, GetTanhGradient); -} // namespace caffe2 + +} // namespace caffe2 diff --git a/caffe2/operators/tanh_op.cu b/caffe2/operators/tanh_op.cu index 0feeb6c4b1527a..a15e6758d11bc3 100644 --- a/caffe2/operators/tanh_op.cu +++ b/caffe2/operators/tanh_op.cu @@ -1,52 +1,81 @@ -#include +#include "caffe2/operators/tanh_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { +namespace { + template -__global__ void TanhKernel(const int N, const T* X, T* Y) { +__global__ void TanhCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + Y[i] = tanh(__ldg(X + i)); +#else Y[i] = tanh(X[i]); +#endif } } template -__global__ void TanhGradientKernel(const int N, const T* Y, const T* dY, - T* dX) { +__global__ void +TanhGradientCUDAKernel(const int N, const T* dY, const T* Y, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = dY[i]*(1 - Y[i]*Y[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * (T(1) - __ldg(Y + i) * __ldg(Y + i)); +#else + dX[i] = dY[i] * (T(1) - Y[i] * Y[i]); +#endif } } -struct TanhCUDAFunctor { - template - inline void operator()(const int n, const T* x, - T* y, CUDAContext* device_context) { - TanhKernel<<cuda_stream()>>>(n, x, y); - return; - } - inline bool InplaceAllowed() { - return true; - } -}; - -struct TanhGradientCUDAFunctor { - template - inline void Run(const int n, const T* y, const T* dy, - T* dx, CUDAContext* device_context) { - TanhGradientKernel<<cuda_stream()>>>(n, y, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool TanhFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + TanhCUDAKernel + <<cuda_stream()>>>(N, X, Y); + return true; +} + +template <> +template +bool TanhGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, + const T* dY, + const T* Y, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + TanhGradientCUDAKernel + <<cuda_stream()>>>(size, dY, Y, dX); + return true; +} REGISTER_CUDA_OPERATOR( - Tanh, UnaryElementwiseOp, CUDAContext, TanhCUDAFunctor>); + Tanh, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + TanhFunctor>); REGISTER_CUDA_OPERATOR( - TanhGradient, BinaryElementwiseOp< - TensorTypes, CUDAContext, - WithoutBroadcast>); -} // namespace caffe2 + TanhGradient, + BinaryElementwiseOp< + TensorTypes, + CUDAContext, + TanhGradientFunctor>); + +} // namespace caffe2 diff --git 
a/caffe2/operators/tanh_op.h b/caffe2/operators/tanh_op.h new file mode 100644 index 00000000000000..b8fdcff4e990df --- /dev/null +++ b/caffe2/operators/tanh_op.h @@ -0,0 +1,31 @@ +#ifndef CAFFE2_OPERATORS_TANH_OP_H_ +#define CAFFE2_OPERATORS_TANH_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct TanhFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const; +}; + +template +struct TanhGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& Y_dims, + const T* dY, + const T* Y, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_TANH_OP_H_ diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 38b5e22ae627ce..e0f8dd160b08a8 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -6,6 +6,8 @@ using namespace nom; +namespace { + std::map getArgumentsFromOperator(caffe2::OperatorDef op) { std::map argMap; @@ -83,6 +85,8 @@ std::vector getDilations(std::map argMap) { return dilations; } +} // namespace + namespace caffe2 { std::unique_ptr @@ -483,4 +487,4 @@ caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m, const caffe2::NetDef& old return predictNet; } -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/python/attention.py b/caffe2/python/attention.py index da9066bc78067b..73be94feaf2b38 100644 --- a/caffe2/python/attention.py +++ b/caffe2/python/attention.py @@ -346,9 +346,9 @@ def apply_soft_coverage_attention( ) # [encoder_length, batch_size, encoder_output_dim] - decoder_hidden_encoder_outputs_sum = model.net.Add( + decoder_hidden_encoder_outputs_sum_tmp = model.net.Add( [weighted_encoder_outputs, weighted_decoder_hidden_state], - s(scope, 'decoder_hidden_encoder_outputs_sum'), + s(scope, 'decoder_hidden_encoder_outputs_sum_tmp'), broadcast=1, ) # [batch_size, encoder_length] @@ -374,8 +374,8 @@ def apply_soft_coverage_attention( # [encoder_length, batch_size, encoder_output_dim] decoder_hidden_encoder_outputs_sum = model.net.Add( - [decoder_hidden_encoder_outputs_sum, scaled_coverage_weights], - decoder_hidden_encoder_outputs_sum, + [decoder_hidden_encoder_outputs_sum_tmp, scaled_coverage_weights], + s(scope, 'decoder_hidden_encoder_outputs_sum'), ) # [batch_size, encoder_length, 1] diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index c5158e65e70098..bf25806f20dde2 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -710,7 +710,7 @@ def testAddOpInMiddle(self): input_to_grad = net.AddGradientOperators({"x4": "x4_grad"}) sum_op = net.Proto().op[-2] self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "x4_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") self.assertEqual(input_to_grad["x1"], "x1_grad") @@ -762,7 +762,7 @@ def testSubOpInMiddle(self): print(str(net.Proto())) sum_op = net.Proto().op[-2] self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "x4_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") self.assertEqual(input_to_grad["x1"], "x1_grad") @@ -788,12 +788,12 @@ def testAddOpAtLeaf(self): net.DotProduct(["x4", "x5"], "x6") input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) sum_op = net.Proto().op[-1] - self.assertEqual(sum_op.input[0], "x5_grad") - 
self.assertEqual(sum_op.input[1], "x4_grad") + self.assertEqual(sum_op.input[0], "x2_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x4_grad") + self.assertEqual(input_to_grad["x1"], "x1_grad") self.assertEqual(input_to_grad["x2"], "x2_grad") - self.assertEqual(input_to_grad["x3"], "x5_grad") + self.assertEqual(input_to_grad["x3"], "x3_grad") def testSubOpAtLeaf(self): # x1 @@ -818,9 +818,9 @@ def testSubOpAtLeaf(self): input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) sum_op = net.Proto().op[-1] self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "x5_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x4_grad") + self.assertEqual(input_to_grad["x1"], "x1_grad") self.assertEqual(input_to_grad["x2"], "x2_grad") self.assertEqual(input_to_grad["x3"], "x3_grad") @@ -846,12 +846,12 @@ def testMultiLayerAddOps(self): net.Add(["x4", "x5"], "x6") input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) sum_op = net.Proto().op[-1] - self.assertEqual(sum_op.input[0], "x6_grad") - self.assertEqual(sum_op.input[1], "x6_grad") + self.assertEqual(sum_op.input[0], "x2_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x6_grad") + self.assertEqual(input_to_grad["x1"], "x1_grad") self.assertEqual(input_to_grad["x2"], "x2_grad") - self.assertEqual(input_to_grad["x3"], "x6_grad") + self.assertEqual(input_to_grad["x3"], "x3_grad") def testMultiLayerSubOps(self): # x1 @@ -876,9 +876,9 @@ def testMultiLayerSubOps(self): input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) sum_op = net.Proto().op[-1] self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "x5_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x6_grad") + self.assertEqual(input_to_grad["x1"], "x1_grad") self.assertEqual(input_to_grad["x2"], "x2_grad") self.assertEqual(input_to_grad["x3"], "x3_grad") diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py index ae6eccb8a335f0..6561efbea3ad67 100644 --- a/caffe2/python/gradient_check_test.py +++ b/caffe2/python/gradient_check_test.py @@ -507,6 +507,7 @@ def testIf(self): class TestWhile(test_util.TestCase): + @unittest.skip("Skip flaky test.") def testWhile(self): with NetBuilder(_use_control_ops=True) as nb: ops.Copy(ops.Const(0), "i") diff --git a/caffe2/python/ideep/squeeze_op_test.py b/caffe2/python/ideep/squeeze_op_test.py new file mode 100644 index 00000000000000..2a2f85dae07985 --- /dev/null +++ b/caffe2/python/ideep/squeeze_op_test.py @@ -0,0 +1,35 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import hypothesis.strategies as st +from hypothesis import given +import numpy as np +from caffe2.python import core, workspace +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class SqueezeTest(hu.HypothesisTestCase): + @given( + squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3), + inplace=st.booleans(), + **mu.gcs + ) + def 
test_squeeze(self, squeeze_dims, inplace, gc, dc): + shape = [ + 1 if dim in squeeze_dims else np.random.randint(1, 5) + for dim in range(4) + ] + X = np.random.rand(*shape).astype(np.float32) + op = core.CreateOperator( + "Squeeze", "X", "X" if inplace else "Y", dims=squeeze_dims + ) + self.assertDeviceChecks(dc, op, [X], [0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index 45baa59aed6bcc..ccf800e440f3cc 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -8,9 +8,10 @@ from caffe2.python import core -def rewrite_init_net_simple(net): +def rewrite_init_net_simple(net, ideep=True): + device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN for op in net.op: - op.device_option.device_type = caffe2_pb2.MKLDNN + op.device_option.device_type = device def last_producer(ops, blob): for (i, op) in reversed(list(enumerate(ops))): @@ -19,7 +20,7 @@ def last_producer(ops, blob): raise ValueError("Failed to find last producer of blob, %s", blob) -def rewrite_run_net_simple(net): +def rewrite_run_net_simple(net, ideep=True): # Simple rewrite for now - assume entire graph can be executed # with MKL, so just insert copy ops for external_input[0] and # external_output[0] @@ -32,12 +33,14 @@ def mkl_tmp(name): "Input blob: {} is not consumed by first op: {}".format( input_blob, net.op[0])) # Modify input/outputs to point to copied MKL blobs. + from_cpu = "CopyCPUToIDEEP" if ideep else "CopyCPUToMKL" + to_cpu = "CopyIDEEPToCPU" if ideep else "CopyMKLToCPU" copy_input_op = core.CreateOperator( - "CopyCPUToMKL", input_blob, mkl_tmp(input_blob)) + from_cpu, input_blob, mkl_tmp(input_blob)) net.op[0].input[0] = mkl_tmp(input_blob) copy_output_ops = [ - core.CreateOperator("CopyMKLToCPU", mkl_tmp(output_blob), output_blob) + core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob) for output_blob in net.external_output] for output_blob in net.external_output: @@ -54,15 +57,16 @@ def mkl_tmp(name): ops = [copy_input_op] + net.op[:] + copy_output_ops del net.op[:] net.op.extend(ops) + device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN for op in net.op: op.device_option.MergeFrom( - core.DeviceOption(device_type=caffe2_pb2.MKLDNN)) + core.DeviceOption(device_type=device)) op.engine = "" -def rewrite_model_helper_simple(model): +def rewrite_model_helper_simple(model, ideep=True): model = copy.deepcopy(model) # All parameter initialization should run on MKL - rewrite_init_net_simple(model.param_init_net.Proto()) - rewrite_run_net_simple(model.net.Proto()) + rewrite_init_net_simple(model.param_init_net.Proto(), ideep) + rewrite_run_net_simple(model.net.Proto(), ideep) return model diff --git a/caffe2/python/mkl/rewrite_graph_test.py b/caffe2/python/mkl/rewrite_graph_test.py index 65bf7fd53d44eb..87b08d5283e54c 100644 --- a/caffe2/python/mkl/rewrite_graph_test.py +++ b/caffe2/python/mkl/rewrite_graph_test.py @@ -180,15 +180,16 @@ def complex_resnet(): return model, [(1, 1, 224, 224)] -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") +@unittest.skipIf(not workspace.C.has_mkldnn or not workspace.C.use_ideep, + "Skipping as we do not have MKLDNN and IDEEP.") class MKLRewriteTest(hu.HypothesisTestCase): @given(gen=st.sampled_from([simple_relu, simple_fc, - simple_mlp, simple_cnn])) - def test_mkl_simple_rewrite(self, gen): + simple_mlp, simple_cnn]), + ideep=st.booleans()) + def test_mkl_simple_rewrite(self, gen, ideep): cpu_model, (shape,) = 
gen() cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) + mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model, ideep) X = np.random.randn(*shape).astype(np.float32) def run(model): @@ -200,10 +201,11 @@ def run(model): np.testing.assert_allclose(run(cpu_model), run(mkl_model), atol=1e-4, rtol=1e-4) - def test_mkl_resnet_rewrite(self): + @given(ideep=st.booleans()) + def test_mkl_resnet_rewrite(self, ideep): cpu_model, (shape,) = complex_resnet() cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) + mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model, ideep) np.random.seed(1701) X = np.random.randn(*shape).astype(np.float32) @@ -215,10 +217,11 @@ def run(model): np.testing.assert_allclose(run(cpu_model), run(mkl_model), atol=1e-4, rtol=1e-4) - def test_mkl_multi_output_rewrite(self): + @given(ideep=st.booleans()) + def test_mkl_multi_output_rewrite(self, ideep): cpu_model, shapes = double_matmul() cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) + mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model, ideep) np.random.seed(1701) Xs = [np.random.randn(*shape).astype(np.float32) for shape in shapes] @@ -236,10 +239,11 @@ def run(model): np.testing.assert_allclose(run(cpu_model), run(mkl_model), atol=1e-4, rtol=1e-4) - def test_mkl_alexnet_rewrite(self): + @given(ideep=st.booleans()) + def test_mkl_alexnet_rewrite(self, ideep): cpu_model, (shape,) = alexnet() cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) + mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model, ideep) np.random.seed(1701) X = np.random.randn(*shape).astype(np.float32) diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index f7b1dca934dd5c..ff6bddeb2bb6d0 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -324,155 +324,3 @@ def test_semantic_broadcast(self, gc, dc): out = workspace.FetchBlob("out") np.testing.assert_array_almost_equal(out, X + Y) self.assertDeviceChecks(dc, op, [X, Y], [0]) - - @given(**hu.gcs) - def test_sum_reduce(self, gc, dc): - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(4, 5).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=0) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # Set broadcast and no axis, i.e. broadcasting last dimensions. 
- X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(2, 3).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=3) - res = np.sum(res, axis=2) - np.testing.assert_array_almost_equal(out, res, decimal=3) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(3, 4).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 500).astype(np.float64) - Y = np.random.rand(1).astype(np.float64) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.array(np.sum(X)) - np.testing.assert_array_almost_equal(out, res, decimal=0) - - # broadcasting with single elem dimensions at both ends - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(1, 3, 4, 1).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=2).reshape(Y.shape) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # fp64 is not supported with the CUDA op - dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA] - self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0]) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(**hu.gcs_gpu_only) - def test_sum_reduce_fp16(self, gc, dc): - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(4, 5).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, device_option=gc) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=0) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # Set broadcast and no axis, i.e. broadcasting last dimensions. 
- X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(2, 3).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0) - - def ref_op(X, Y): - res = np.sum(X, axis=3) - res = np.sum(res, axis=2) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(3, 4).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # broadcasting with single elem dimensions at both ends - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(1, 3, 4, 1).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - return [res.reshape(Y.shape)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 0232b4b7699a11..30d4a8da4f3532 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -12,32 +12,6 @@ class TestElementwiseOps(hu.HypothesisTestCase): - @given(n=st.integers(0, 10), m=st.integers(4, 6), - d=st.integers(2, 3), seed=st.integers(0, 1000), **hu.gcs) - def test_div(self, n, m, d, gc, dc, seed): - np.random.seed(seed) - X = np.random.rand(n, m, d).astype(np.float32) - Y = np.random.rand(n, m, d).astype(np.float32) + 5.0 - - def div_op(X, Y): - return [np.divide(X, Y)] - - op = core.CreateOperator( - "Div", - ["X", "Y"], - ["Z"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=div_op, - ) - - self.assertGradientChecks( - gc, op, [X, Y], 0, [0], stepsize=1e-4, threshold=1e-2) - @given(n=st.integers(0, 6), m=st.integers(4, 6), seed=st.integers(0, 1000), **hu.gcs) def test_log(self, n, m, gc, dc, seed): @@ -307,3 +281,137 @@ def eq(X, Y): result_2 = net_2.EQ(["X", "Y"], 1) (shapes, types) = workspace.InferShapesAndTypes([net]) self.assertTrue(str(result_2) not in shapes) + + def _run_single_test(self, op, ref, A, B, test_grad, gc, dc): + inputs = [A, B] + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=inputs, + reference=ref, + ) + self.assertDeviceChecks(dc, op, inputs, [0]) + if test_grad: + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + + inputs = [B, A] + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=inputs, + reference=ref, + ) + self.assertDeviceChecks(dc, op, inputs, [0]) + if test_grad: + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + + def _test_binary_op( + self, op_name, np_ref, n, m, k, t, bias, test_grad, gc, dc): + op = core.CreateOperator( + op_name, + ["A", "B"], + ["C"], + ) + + def ref(A, B): + return [np_ref(A, B)] + + A = np.random.rand(n, m, k, t).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = 
np.random.rand(1).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = np.random.rand(k, t).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = np.random.rand(n, m, 1, 1).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = np.random.rand(m, 1, t).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = np.random.rand(1, m, 1, t).astype(np.float32) + bias + B = np.random.rand(n, 1, k, 1).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_add(self, n, m, k, t, gc, dc): + self._test_binary_op("Add", np.add, n, m, k, t, -0.5, True, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_sub(self, n, m, k, t, gc, dc): + self._test_binary_op("Sub", np.subtract, n, m, + k, t, -0.5, True, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_mul(self, n, m, k, t, gc, dc): + self._test_binary_op("Mul", np.multiply, n, m, + k, t, -0.5, True, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_div(self, n, m, k, t, gc, dc): + self._test_binary_op("Div", np.divide, n, m, k, t, 1.0, True, gc, dc) + + def _test_bitwise_binary_op(self, op_name, np_ref, n, m, k, t, gc, dc): + op = core.CreateOperator( + op_name, + ["A", "B"], + ["C"], + ) + + def ref(A, B): + return [np_ref(A, B)] + + A = np.random.randint(128, size=(n, m, k, t)) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=1) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=(k, t)) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=(n, m, 1, 1)) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=(m, 1, t)) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=(1, m, 1, t)) + B = np.random.randint(128, size=(n, 1, k, 1)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_bitwise_and(self, n, m, k, t, gc, dc): + self._test_bitwise_binary_op( + "BitwiseAnd", np.bitwise_and, n, m, k, t, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_bitwise_or(self, n, m, k, t, gc, dc): + self._test_bitwise_binary_op( + "BitwiseOr", np.bitwise_or, n, m, k, t, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_bitwise_xor(self, n, m, k, t, gc, dc): + self._test_bitwise_binary_op( + "BitwiseXor", np.bitwise_xor, n, m, k, t, gc, dc) 
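The new _test_binary_op cases above exercise full NumPy-style broadcasting (e.g. shapes like (1, m, 1, t) against (n, 1, k, 1)), which is what the (A_ndim, A_dims, B_ndim, B_dims, ...) overloads declared in math.h further below are expected to provide in place of the old Rowwise/Colwise-only helpers. The following self-contained sketch — BroadcastDims and BroadcastIndex are hypothetical helper names, not the Caffe2 implementation — shows the indexing rule those tests assume:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Right-align the two shapes, pad the shorter one with 1s, and take the max
// along each axis; an axis may broadcast only if one side is 1 or both match.
std::vector<int> BroadcastDims(std::vector<int> A, std::vector<int> B) {
  const std::size_t ndim = std::max(A.size(), B.size());
  A.insert(A.begin(), ndim - A.size(), 1);
  B.insert(B.begin(), ndim - B.size(), 1);
  std::vector<int> C(ndim);
  for (std::size_t i = 0; i < ndim; ++i) {
    assert(A[i] == B[i] || A[i] == 1 || B[i] == 1);
    C[i] = std::max(A[i], B[i]);
  }
  return C;
}

// Map a linear index in the broadcast output shape back to a linear index in
// an input whose (right-aligned, 1-padded) shape is `dims`.
int BroadcastIndex(int out_index, const std::vector<int>& out_dims,
                   const std::vector<int>& dims) {
  int in_index = 0;
  int in_stride = 1;
  for (int i = static_cast<int>(out_dims.size()) - 1; i >= 0; --i) {
    const int coord = out_index % out_dims[i];
    out_index /= out_dims[i];
    if (dims[i] != 1) {
      in_index += coord * in_stride;
    }
    in_stride *= dims[i];
  }
  return in_index;
}

int main() {
  // Same kind of shape pair as the tests above: (1, 3) + (2, 1) -> (2, 3).
  const std::vector<int> a_dims = {1, 3};
  const std::vector<int> b_dims = {2, 1};
  const std::vector<float> A = {1.0f, 2.0f, 3.0f};
  const std::vector<float> B = {10.0f, 20.0f};

  const std::vector<int> c_dims = BroadcastDims(a_dims, b_dims);
  std::vector<float> C(c_dims[0] * c_dims[1]);
  for (int i = 0; i < static_cast<int>(C.size()); ++i) {
    C[i] = A[BroadcastIndex(i, c_dims, a_dims)] +
           B[BroadcastIndex(i, c_dims, b_dims)];
  }

  const std::vector<float> expected = {11.0f, 12.0f, 13.0f, 21.0f, 22.0f, 23.0f};
  assert(C == expected);
  return 0;
}

The gradient checks in _run_single_test then only have to verify that each input's gradient is reduced back over the axes that were broadcast, which is why both argument orders (A, B) and (B, A) are tested.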
diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py index bf3a6564ef4594..175256e920d296 100644 --- a/caffe2/python/operator_test/reshape_ops_test.py +++ b/caffe2/python/operator_test/reshape_ops_test.py @@ -48,6 +48,8 @@ def test_zero_dim(self): expected_shape=(4, 2, 1)) _test_reshape(old_shape=(4, 2, 1), new_shape=(0, 2, 1), expected_shape=(4, 2, 1), arg_shape=False) + _test_reshape(old_shape=(0, 0), new_shape=(0, 0, 0), + expected_shape=(0, 0, 0), arg_shape=False) def test_zero_dim_and_missing_dim(self): _test_reshape(old_shape=(4, 2, 1), new_shape=(0, -1, 0), diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py index f31f003c669648..59eb899c97c8df 100644 --- a/caffe2/python/operator_test/shape_inference_test.py +++ b/caffe2/python/operator_test/shape_inference_test.py @@ -494,6 +494,12 @@ def testInt8Conversion(self): workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float32)) self.InferTensorRunAndCompare(model) + def testShapeOp(self): + model = model_helper.ModelHelper(name="shape_op_test") + model.Shape('x', 'y') + workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float32)) + self.InferTensorRunAndCompare(model) + def InferTensorRunAndCompare(self, model, expected_uninferred_blobs=None): ''' Runs shape inference, and then the model to check diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h index bd0b2d6e44db9d..81c8228cf25f7b 100644 --- a/caffe2/utils/math.h +++ b/caffe2/utils/math.h @@ -15,6 +15,7 @@ extern "C" { #include "caffe2/core/common.h" #include "caffe2/core/types.h" +#include "caffe2/utils/math_utils.h" #include "Eigen/Core" #include "Eigen/Dense" @@ -31,26 +32,26 @@ class DefaultEngine {}; // Common Eigen types that we will often use template using EigenMatrixMap = - Eigen::Map >; + Eigen::Map>; template using EigenArrayMap = - Eigen::Map >; + Eigen::Map>; template -using EigenVectorMap = Eigen::Map >; +using EigenVectorMap = Eigen::Map>; template -using EigenVectorArrayMap = Eigen::Map >; +using EigenVectorArrayMap = Eigen::Map>; template using ConstEigenMatrixMap = - Eigen::Map >; + Eigen::Map>; template using ConstEigenArrayMap = - Eigen::Map >; + Eigen::Map>; template using ConstEigenVectorMap = - Eigen::Map >; + Eigen::Map>; template using ConstEigenVectorArrayMap = - Eigen::Map >; + Eigen::Map>; namespace math { @@ -81,69 +82,107 @@ void InvSqrt(const int N, const T* x, T* y, Context* context); template void Sqr(const int N, const T* x, T* y, Context* context); +template +void Neg(const int N, const T* x, T* y, Context* context); + +template +void Sign(const int N, const T* x, T* y, Context* context); + template void Not(const int N, const T* x, T* y, Context* context); template void Powx(const int N, const T* a, const T b, T* y, Context* context); -#define CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(name) \ +#define CAFFE2_DECLARE_COMPARE_OP(Comp) \ template \ - void name(const int N, const T* a, const T* b, bool* y, Context* context); \ + void Comp(const int N, const T* A, const T* B, bool* C, Context* context); \ + \ + template \ + void Rowwise##Comp( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + bool* C, \ + Context* context); \ + \ + template \ + void Colwise##Comp( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + bool* C, \ + Context* context); \ + \ template \ - void name##ToRow( \ - const int M, \ - const int N, \ - const T* a, \ - const T* b, \ - bool* y, \ + 
void Comp( \ + const int A_ndim, \ + const int* A_dims, \ + const int B_ndim, \ + const int* B_dims, \ + const T* A, \ + const T* B, \ + bool* C, \ Context* context); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(LT); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(LE); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(GT); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(GE); - -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(And); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(Or); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(Xor); +CAFFE2_DECLARE_COMPARE_OP(EQ) +CAFFE2_DECLARE_COMPARE_OP(NE) +CAFFE2_DECLARE_COMPARE_OP(LT) +CAFFE2_DECLARE_COMPARE_OP(LE) +CAFFE2_DECLARE_COMPARE_OP(GT) +CAFFE2_DECLARE_COMPARE_OP(GE) -#undef CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT +#undef CAFFE2_DECLARE_COMPARE_OP -#define CAFFE2_DECLARE_BINARY_OP(name) \ - template \ - void name(const int N, const T* a, const T* b, T* y, Context* context); \ +#define CAFFE2_DECLARE_BINARY_OP(Func) \ template \ - void name##ToRow( \ - const int M, \ - const int N, \ - const T* a, \ - const T* b, \ - T* y, \ + void Func(const int N, const T* A, const T* B, T* C, Context* context); \ + \ + template \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ Context* context); \ + \ + template \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + Context* context); \ + \ template \ - void name##ToRow( \ - const int M, const int N, const T* x, T* y, Context* context); \ - template \ - void name##ToCol( \ - const int M, const int N, const T* x, T* y, Context* context); - -CAFFE2_DECLARE_BINARY_OP(Add); -CAFFE2_DECLARE_BINARY_OP(Sub); -CAFFE2_DECLARE_BINARY_OP(Mul); -CAFFE2_DECLARE_BINARY_OP(Div); - -#undef CAFFE2_DECLARE_BINARY_OP + void Func( \ + const int A_ndim, \ + const int* A_dims, \ + const int B_ndim, \ + const int* B_dims, \ + const T* A, \ + const T* B, \ + T* C, \ + Context* context); -namespace internal { +CAFFE2_DECLARE_BINARY_OP(Add) +CAFFE2_DECLARE_BINARY_OP(Sub) +CAFFE2_DECLARE_BINARY_OP(Mul) +CAFFE2_DECLARE_BINARY_OP(Div) -// Increase the index digits by one based on dims. -void IncreaseIndexInDims(const int n, const int* dims, int* index); +CAFFE2_DECLARE_BINARY_OP(And) +CAFFE2_DECLARE_BINARY_OP(Or) +CAFFE2_DECLARE_BINARY_OP(Xor) -// Get index value from dims and index digits. -int GetIndexFromDims(const int n, const int* dims, const int* index); +CAFFE2_DECLARE_BINARY_OP(BitwiseAnd) +CAFFE2_DECLARE_BINARY_OP(BitwiseOr) +CAFFE2_DECLARE_BINARY_OP(BitwiseXor) -} // namespace internal +#undef CAFFE2_DECLARE_BINARY_OP template void ReduceMin( @@ -152,6 +191,7 @@ void ReduceMin( T* y, Tensor* scratch_ptr, Context* context); + template void ReduceMax( const int N, @@ -237,14 +277,12 @@ void AddStripedBatch( // Compute the row-wise max of a N*D matrix X, and write it to a N // dimensional vector y. template -void RowwiseMax(const int N, const int D, const T* x, T* y, - Context* context); +void RowwiseMax(const int N, const int D, const T* x, T* y, Context* context); // Compute the column-wise max of a N*D matrix X, and write it to a D // dimensional vector y. template -void ColwiseMax(const int N, const int D, const T* x, T* y, - Context* context); +void ColwiseMax(const int N, const int D, const T* x, T* y, Context* context); // Elemwise maximum of vector x and vector y. z[i] = max(x[i], y[i]) template @@ -370,8 +408,12 @@ void Dot(const int N, const T* a, const T* b, T* y, Context* context); // Sum of vector x, and writes the result to a single value y. 
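Editor's note (illustrative, not part of the patch): the generalized compare and binary-op overloads declared above take two dimension lists and broadcast them NumPy-style. As a reference for that shape rule only, here is a hypothetical standalone helper (BroadcastShape is my name, not a symbol from the patch) that right-aligns the two shapes and requires each pair of dimensions to match or be 1; this is the same contract ComputeBroadcastBinaryOpDims enforces in caffe2/utils/math_utils.cc later in this patch.

#include <algorithm>
#include <cassert>
#include <vector>

// Right-align two shapes and compute the broadcast output shape.
std::vector<int> BroadcastShape(std::vector<int> a, std::vector<int> b) {
  const std::size_t ndim = std::max(a.size(), b.size());
  a.insert(a.begin(), ndim - a.size(), 1);
  b.insert(b.begin(), ndim - b.size(), 1);
  std::vector<int> c(ndim);
  for (std::size_t i = 0; i < ndim; ++i) {
    // Each aligned pair of dims must match, or one of them must be 1.
    assert(a[i] == b[i] || a[i] == 1 || b[i] == 1);
    c[i] = std::max(a[i], b[i]);
  }
  return c;
}

int main() {
  // A: 2 x 3 x 4, B: 3 x 1  ->  C: 2 x 3 x 4
  const std::vector<int> c = BroadcastShape({2, 3, 4}, {3, 1});
  assert((c == std::vector<int>{2, 3, 4}));
  return 0;
}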
template -void Sum(const int N, const T* x, T* y, Context* context, - Tensor* scratch_ptr = nullptr); +void Sum( + const int N, + const T* x, + T* y, + Context* context, + Tensor* scratch_ptr = nullptr); // Sum of squares of vector x, and writes the result to a single value y. template @@ -385,8 +427,13 @@ void SumSqr( // Select does index selection of the rows a N*D matrix x, and gives the N // dimensional vector y that contains the selected data. template -void Select(const int N, const int D, const T* x, const int* idx, T* y, - Context* context); +void Select( + const int N, + const int D, + const T* x, + const int* idx, + T* y, + Context* context); template void Scale(const int N, const float alpha, const T* x, T* y, Context* context); @@ -487,12 +534,12 @@ void Col2Im( // image. image_size is H * W template void BiasCHW( - const T* bias, - const T* bias_multiplier, - const int bias_channels, - const int image_size, - T* image, - Context* context); + const T* bias, + const T* bias_multiplier, + const int bias_channels, + const int image_size, + T* image, + Context* context); template void CopyMatrix( @@ -525,7 +572,7 @@ inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { // is no overflow or underflow in the calculation. template constexpr T divUp(T a, T b) { - return (a + b - (T) 1) / b; + return (a + b - (T)1) / b; } // Rounds a up to the next highest multiple of b. User must be careful @@ -548,8 +595,8 @@ constexpr T integerNextHighestPowerOf2(T v) { return (integerIsPowerOf2(v) ? (T)2 * v : ((T)1 << (integerLog2(v) + 1))); } -} // namespace math -} // namespace caffe2 +} // namespace math +} // namespace caffe2 #include "caffe2/utils/math-detail.h" -#endif // CAFFE2_UTILS_MATH_H_ +#endif // CAFFE2_UTILS_MATH_H_ diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 140471a457dc4d..2b45fad7a63523 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -22,18 +22,19 @@ #include #include #include +#include #include #include -#include "caffe2/utils/cpu_neon.h" #include "caffe2/core/context.h" +#include "caffe2/utils/cpu_neon.h" #include "Eigen/Core" #include "Eigen/Dense" #ifdef CAFFE2_USE_MKL #include -#endif // CAFFE2_USE_MKL +#endif // CAFFE2_USE_MKL #ifdef CAFFE2_USE_HPTT #include @@ -91,40 +92,40 @@ void Gemm( C_mat *= beta; } switch (TransA) { - case CblasNoTrans: { - switch (TransB) { - case CblasNoTrans: - C_mat.noalias() += alpha * ( - ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, K, M)); - return; - case CblasTrans: - C_mat.noalias() += alpha * ( - ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, K, M)); - return; - default: - LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; + case CblasNoTrans: { + switch (TransB) { + case CblasNoTrans: + C_mat.noalias() += alpha * + (ConstEigenMatrixMap(B, N, K) * + ConstEigenMatrixMap(A, K, M)); + return; + case CblasTrans: + C_mat.noalias() += alpha * + (ConstEigenMatrixMap(B, K, N).transpose() * + ConstEigenMatrixMap(A, K, M)); + return; + default: + LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; + } } - } - case CblasTrans: { - switch (TransB) { - case CblasNoTrans: - C_mat.noalias() += alpha * ( - ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, M, K).transpose()); - return; - case CblasTrans: - C_mat.noalias() += alpha * ( - ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, M, K).transpose()); - return; - default: - LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; + case CblasTrans: { + switch (TransB) { + case CblasNoTrans: + 
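Editor's note (illustrative, not part of the patch): the Eigen fallback for Gemm in math_cpu.cc maps row-major buffers through column-major Eigen maps, so C = A * B is evaluated as C^T = B^T * A^T; that is why the operands and their dimensions appear swapped in the code above. A small self-contained check of that identity, assuming only Eigen/Core:

#include <cassert>
#include "Eigen/Core"

int main() {
  const float A[6] = {1, 2, 3, 4, 5, 6}; // 2 x 3, row-major
  const float B[6] = {7, 8, 9, 10, 11, 12}; // 3 x 2, row-major
  float C[4] = {0, 0, 0, 0}; // 2 x 2, row-major

  using Mat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>; // col-major
  // Viewing row-major data through a column-major Map yields the transpose,
  // so C^T = B^T * A^T produces row-major C without any copies.
  Eigen::Map<Mat>(C, 2, 2).noalias() =
      Eigen::Map<const Mat>(B, 2, 3) * Eigen::Map<const Mat>(A, 3, 2);

  assert(C[0] == 1 * 7 + 2 * 9 + 3 * 11); // C(0,0) = 58
  assert(C[1] == 1 * 8 + 2 * 10 + 3 * 12); // C(0,1) = 64
  assert(C[2] == 4 * 7 + 5 * 9 + 6 * 11); // C(1,0) = 139
  assert(C[3] == 4 * 8 + 5 * 10 + 6 * 12); // C(1,1) = 154
  return 0;
}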
C_mat.noalias() += alpha * + (ConstEigenMatrixMap(B, N, K) * + ConstEigenMatrixMap(A, M, K).transpose()); + return; + case CblasTrans: + C_mat.noalias() += alpha * + (ConstEigenMatrixMap(B, K, N).transpose() * + ConstEigenMatrixMap(A, M, K).transpose()); + return; + default: + LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; + } } - } - default: - LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA"; + default: + LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA"; } } @@ -157,14 +158,14 @@ void GemmEx( case CblasNoTrans: { switch (TransB) { case CblasNoTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) * - ConstStridedMap(A, K, M, OuterStride(lda))); + C_mat.noalias() += alpha * + (ConstStridedMap(B, N, K, OuterStride(ldb)) * + ConstStridedMap(A, K, M, OuterStride(lda))); return; case CblasTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * - ConstStridedMap(A, K, M, OuterStride(lda))); + C_mat.noalias() += alpha * + (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * + ConstStridedMap(A, K, M, OuterStride(lda))); return; default: LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; @@ -173,14 +174,14 @@ void GemmEx( case CblasTrans: { switch (TransB) { case CblasNoTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) * - ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); + C_mat.noalias() += alpha * + (ConstStridedMap(B, N, K, OuterStride(ldb)) * + ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); return; case CblasTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * - ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); + C_mat.noalias() += alpha * + (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * + ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); return; default: LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; @@ -213,15 +214,15 @@ void Gemv( } switch (TransA) { case CblasNoTrans: { - y_vec.noalias() += alpha * ( - ConstEigenMatrixMap(A, N, M).transpose() * - ConstEigenVectorMap(x, N)); + y_vec.noalias() += alpha * + (ConstEigenMatrixMap(A, N, M).transpose() * + ConstEigenVectorMap(x, N)); return; } case CblasTrans: { - y_vec.noalias() += alpha * ( - ConstEigenMatrixMap(A, N, M) * - ConstEigenVectorMap(x, M)); + y_vec.noalias() += alpha * + (ConstEigenMatrixMap(A, N, M) * + ConstEigenVectorMap(x, M)); return; } default: @@ -247,13 +248,12 @@ void Gemv( CAFFE2_SPECIALIZED_SCALE(float) #undef CAFFE2_SPECIALIZED_SCALE -#define CAFFE2_SPECIALIZED_DOT(T) \ -template<> \ -void Dot( \ - const int N, const T* a, const T* b, T* y, \ - CPUContext* context) { \ - *y = ConstEigenVectorMap(a, N).dot(ConstEigenVectorMap(b, N)); \ -} +#define CAFFE2_SPECIALIZED_DOT(T) \ + template <> \ + void Dot( \ + const int N, const T* a, const T* b, T* y, CPUContext* context) { \ + *y = ConstEigenVectorMap(a, N).dot(ConstEigenVectorMap(b, N)); \ + } CAFFE2_SPECIALIZED_DOT(float) #undef CAFFE2_SPECIALIZED_DOT @@ -271,17 +271,22 @@ CAFFE2_SPECIALIZED_DOT(float) CAFFE2_SPECIALIZED_AXPY(float) #undef CAFFE2_SPECIALIZED_AXPY -#define CAFFE2_SPECIALIZED_AXPBY(T) \ -template <> \ -void Axpby(const int N, const T alpha, const T* x, \ - const T beta, T* y, CPUContext* context) { \ - EigenVectorMap y_vec(y, N); \ - y_vec = y_vec * beta + ConstEigenVectorMap(x, N) * alpha; \ -} +#define CAFFE2_SPECIALIZED_AXPBY(T) \ + template <> \ + void Axpby( \ + const int N, \ + const T alpha, \ + const T* x, \ + const T beta, \ + T* 
y, \ + CPUContext* context) { \ + EigenVectorMap y_vec(y, N); \ + y_vec = y_vec * beta + ConstEigenVectorMap(x, N) * alpha; \ + } CAFFE2_SPECIALIZED_AXPBY(float) #undef CAFFE2_SPECIALIZED_AXPBY -#else // CAFFE2_USE_EIGEN_FOR_BLAS +#else // CAFFE2_USE_EIGEN_FOR_BLAS template <> void Gemm( @@ -299,8 +304,21 @@ void Gemm( TensorProto::DataType /*math_type*/) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, N); + cblas_sgemm( + CblasRowMajor, + TransA, + TransB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + N); } template <> @@ -319,8 +337,21 @@ void GemmEx( float* C, const int ldc, CPUContext* /*context*/) { - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); + cblas_sgemm( + CblasRowMajor, + TransA, + TransB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); } template <> @@ -393,7 +424,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) CPUContext*) { \ cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \ } -#else // CAFFE2_USE_MKL +#else // CAFFE2_USE_MKL #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ template <> \ void Axpby( \ @@ -406,11 +437,11 @@ CAFFE2_SPECIALIZED_AXPY(float, s) cblas_##prefix##scal(N, beta, y, 1); \ cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ } -#endif // CAFFE2_USE_MKL +#endif // CAFFE2_USE_MKL CAFFE2_SPECIALIZED_AXPBY(float, s) #undef CAFFE2_SPECIALIZED_AXPBY -#endif // CAFFE2_USE_EIGEN_FOR_BLAS +#endif // CAFFE2_USE_EIGEN_FOR_BLAS template <> void GemmBatched( @@ -549,28 +580,28 @@ DELEGATE_POWX_FUNCTION(float, vsPowx) DELEGATE_POWX_FUNCTION(double, vdPowx) #undef DELEGATE_POWX_FUNCTION -#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \ - template <> \ - void Funcname( \ - const int N, const T* a, const T* b, T* y, CPUContext*) { \ - OriginalFunc(N, a, b, y); \ +#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, FuncImpl) \ + template <> \ + void Func( \ + const int N, const T* A, const T* B, T* C, CPUContext*) { \ + FuncImpl(N, A, B, C); \ } -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd) -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub) -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul) -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) #undef DELEGATE_SIMPLE_BINARY_FUNCTION -#else // CAFFE2_USE_MKL +#else // CAFFE2_USE_MKL #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ template <> \ void Funcname(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorMap(x, N).array().expr(); \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).expr(); \ } DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp) DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log) @@ -586,12 +617,12 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, InvSqrt, rsqrt) DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square) #undef DELEGATE_SIMPLE_UNARY_FUNCTION -#define DELEGATE_SINCOS_FUNCTION(T) \ - template <> \ - void SinCos( \ - const int N, const T* x, T* ys, T* yc, CPUContext*) { \ - EigenVectorMap(ys, N) = ConstEigenVectorMap(x, 
N).array().sin(); \ - EigenVectorMap(yc, N) = ConstEigenVectorMap(x, N).array().cos(); \ +#define DELEGATE_SINCOS_FUNCTION(T) \ + template <> \ + void SinCos( \ + const int N, const T* x, T* ys, T* yc, CPUContext*) { \ + EigenVectorMap(ys, N) = ConstEigenVectorArrayMap(x, N).sin(); \ + EigenVectorMap(yc, N) = ConstEigenVectorArrayMap(x, N).cos(); \ } DELEGATE_SINCOS_FUNCTION(float) DELEGATE_SINCOS_FUNCTION(double) @@ -600,36 +631,56 @@ DELEGATE_SINCOS_FUNCTION(double) #define DELEGATE_POWX_FUNCTION(T) \ template <> \ void Powx(const int N, const T* a, T b, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorMap(a, N).array().pow(b); \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(a, N).pow(b); \ } DELEGATE_POWX_FUNCTION(float) #undef DELEGATE_POWX_FUNCTION -#endif // CAFFE2_USE_MKL - +#endif // CAFFE2_USE_MKL -#define EIGEN_SIMPLE_BINARY_FUNCTION(T, Funcname, expr) \ -template <> \ -void Funcname( \ - const int N, const T* a, const T* b, T* y, \ - CPUContext*) { \ - EigenVectorMap(y, N) = \ - ConstEigenVectorMap(a, N).array() expr \ - ConstEigenVectorMap(b, N).array(); \ -} +#define DELEGATE_NEG_FUNCTION(T) \ + template <> \ + void Neg(const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = -ConstEigenVectorMap(x, N); \ + } +DELEGATE_NEG_FUNCTION(float) +DELEGATE_NEG_FUNCTION(double) +DELEGATE_NEG_FUNCTION(std::int32_t) +DELEGATE_NEG_FUNCTION(std::int64_t) +#undef DELEGATE_NEG_FUNCTION + +#define DELEGATE_SIGN_FUNCTION(T) \ + template <> \ + void Sign(const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).sign(); \ + } +DELEGATE_SIGN_FUNCTION(float) +DELEGATE_SIGN_FUNCTION(double) +DELEGATE_SIGN_FUNCTION(std::int32_t) +DELEGATE_SIGN_FUNCTION(std::int64_t) +#undef DELEGATE_SIGN_FUNCTION + +#define EIGEN_SIMPLE_BINARY_FUNCTION(T, Func, expr) \ + template <> \ + void Func( \ + const int N, const T* A, const T* B, T* C, CPUContext*) { \ + EigenVectorMap(C, N) = ConstEigenVectorArrayMap(A, N) \ + expr ConstEigenVectorArrayMap(B, N); \ + } #ifdef CAFFE2_USE_MKL -#define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) +#define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr) #else -#define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) +#define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(float, Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(double, Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr) #endif @@ -638,9 +689,8 @@ DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -) DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *) DEFINE_SIMPLE_BINARY_FUNCTION(Div, /) +#undef DEFINE_SIMPLE_BINARY_FUNCTION #undef EIGEN_SIMPLE_BINARY_FUNCTION -#undef DEFINE_FLOAT_BINARY_FUNCTION - //////////////////////////////////////////////////////////////////////////////// // Common math functions being used in Caffe that do not have a BLAS or MKL @@ -677,31 +727,6 @@ CAFFE2_SPECIALIZED_REDUCEMAX(int64_t) #undef CAFFE2_SPECIALIZED_REDUCEMAX -namespace internal { - -void IncreaseIndexInDims(const int n, const int* dims, int* index) { - for (int i = n - 1; i >= 0; --i) { - 
++index[i]; - if (index[i] >= dims[i]) { - index[i] -= dims[i]; - } else { - break; - } - } -} - -int GetIndexFromDims(const int n, const int* dims, const int* index) { - int sum = 0; - for (int i = 0; i < n; ++i) { - if (dims[i] > 1) { - sum = sum * dims[i] + index[i]; - } - } - return sum; -} - -} // namespace internal - namespace { template @@ -728,9 +753,9 @@ void ReduceTensor( std::vector index(num_dims, 0); for (int X_index = 0; X_index < X_size; ++X_index) { const int Y_index = - internal::GetIndexFromDims(num_dims, Y_dims.data(), index.data()); + utils::GetIndexFromDims(num_dims, Y_dims.data(), index.data()); Y[Y_index] = reducer(Y[Y_index], X[X_index]); - internal::IncreaseIndexInDims(num_dims, dims, index.data()); + utils::IncreaseIndexInDims(num_dims, dims, index.data()); } } @@ -868,9 +893,9 @@ void BroadcastImpl( std::vector index(Y_ndim, 0); for (int Y_index = 0; Y_index < Y_size; ++Y_index) { const int X_index = - internal::GetIndexFromDims(Y_ndim, X_dims_ex.data(), index.data()); + utils::GetIndexFromDims(Y_ndim, X_dims_ex.data(), index.data()); Y[Y_index] = X[X_index]; - internal::IncreaseIndexInDims(Y_ndim, Y_dims, index.data()); + utils::IncreaseIndexInDims(Y_ndim, Y_dims, index.data()); } } @@ -920,10 +945,10 @@ void MomentsImpl( std::vector index(num_dims, 0); for (int X_index = 0; X_index < X_size; ++X_index) { const int Y_index = - internal::GetIndexFromDims(num_dims, Y_dims.data(), index.data()); + utils::GetIndexFromDims(num_dims, Y_dims.data(), index.data()); mean[Y_index] += X[X_index]; variance[Y_index] += X[X_index] * X[X_index]; - internal::IncreaseIndexInDims(num_dims, dims, index.data()); + utils::IncreaseIndexInDims(num_dims, dims, index.data()); } for (int Y_index = 0; Y_index < Y_size; ++Y_index) { mean[Y_index] /= static_cast(scale); @@ -992,42 +1017,169 @@ CAFFE2_SPECIALIZED_ELEMWISEMAX(float) CAFFE2_SPECIALIZED_MAXIMUM(float) #undef CAFFE2_SPECIALIZED_MAXIMUM -// AddToRow and AddToCol adds the corresponding row/col vector b to the matrix a -// of shape M x N. The actual implementation uses eigen which is column major, -// so notice the row/column swap in the actual implementation. -#define DELEGATE_BROADCAST_BINARY_FUNCTION(T, Funcname, expr) \ - template <> \ - void Funcname##ToRow( \ - const int M, const int N, const T* a, const T* b, T* y, CPUContext*) { \ - EigenArrayMap(y, N, M) = ConstEigenArrayMap(a, N, M).colwise() \ - expr ConstEigenVectorArrayMap(b, N); \ - } \ - /* inplace versions */ \ - template <> \ - void Funcname##ToRow( \ - const int M, const int N, const T* x, T* y, CPUContext*) { \ - EigenArrayMap(y, N, M).colwise() expr## = \ - ConstEigenVectorArrayMap(x, N); \ - } \ - template <> \ - void Funcname##ToCol( \ - const int M, const int N, const T* x, T* y, CPUContext*) { \ - EigenArrayMap(y, N, M).rowwise() expr## = \ - ConstEigenVectorArrayMap(x, M).transpose(); \ +// The actual implementation uses eigen which is column major, so notice the +// row/column swap in the actual implementation. 
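Editor's note (illustrative, not part of the patch): ReduceTensor, BroadcastImpl and MomentsImpl now call utils::IncreaseIndexInDims and utils::GetIndexFromDims, which this patch moves from the internal namespace into caffe2/utils/math_utils.cc. To make the traversal concrete, here is a standalone copy of the two helpers plus a tiny reduction walk-through; the helper bodies are taken verbatim from the patch, only main() is illustrative.

#include <cassert>

// Advance a row-major multi-index over `dims`.
void IncreaseIndexInDims(const int n, const int* dims, int* index) {
  for (int i = n - 1; i >= 0; --i) {
    ++index[i];
    if (index[i] >= dims[i]) {
      index[i] -= dims[i];
    } else {
      break;
    }
  }
}

// Linearize a multi-index, skipping size-1 dims; this is what collapses the
// reduced (or broadcast) dimensions onto the smaller tensor.
int GetIndexFromDims(const int n, const int* dims, const int* index) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    if (dims[i] > 1) {
      sum = sum * dims[i] + index[i];
    }
  }
  return sum;
}

int main() {
  const int X_dims[2] = {2, 3}; // input is 2 x 3
  const int Y_dims[2] = {1, 3}; // reduce over axis 0 -> output is 1 x 3
  int index[2] = {0, 0};
  for (int x = 0; x < 2 * 3; ++x) {
    const int y = GetIndexFromDims(2, Y_dims, index);
    assert(y == x % 3); // every row of X maps onto the same length-3 output
    IncreaseIndexInDims(2, X_dims, index);
  }
  return 0;
}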
+ +#define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + if (C == B) { \ + EigenArrayMap(C, cols, rows).colwise() expr## = \ + ConstEigenVectorArrayMap(A, cols); \ + } else { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(B, cols, rows) \ + .colwise() expr ConstEigenVectorArrayMap(A, cols); \ + } \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + if (C == B) { \ + EigenArrayMap(C, cols, rows).rowwise() expr## = \ + ConstEigenVectorArrayMap(A, rows).transpose(); \ + } else { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(B, cols, rows) \ + .rowwise() expr ConstEigenVectorArrayMap(A, rows) \ + .transpose(); \ + } \ + } + +#define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + if (C == A) { \ + EigenArrayMap(C, cols, rows).colwise() expr## = \ + ConstEigenVectorArrayMap(B, cols); \ + } else { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(A, cols, rows) \ + .colwise() expr ConstEigenVectorArrayMap(B, cols); \ + } \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + if (C == A) { \ + EigenArrayMap(C, cols, rows).rowwise() expr## = \ + ConstEigenVectorArrayMap(B, rows).transpose(); \ + } else { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(A, cols, rows) \ + .rowwise() expr ConstEigenVectorArrayMap(B, rows) \ + .transpose(); \ + } \ } -#define DEFINE_BROADCAST_BINARY_FUNCTION(name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op) \ +#define DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(T, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) -DEFINE_BROADCAST_BINARY_FUNCTION(Add, +) -DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -) -DEFINE_BROADCAST_BINARY_FUNCTION(Mul, *) -DEFINE_BROADCAST_BINARY_FUNCTION(Div, /) +#define DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(float, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(double, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, Func, expr) -#undef DEFINE_BROADCAST_BINARY_FUNCTION -#undef DELEGATE_BROADCAST_BINARY_FUNCTION +DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Add, +) +DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *) + +#undef DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION +#undef DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION + +#define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \ + template <> \ + void RowwiseSub( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + EigenArrayMap(C, cols, rows) = \ + (-ConstEigenArrayMap(B, cols, rows)).colwise() + \ + ConstEigenVectorArrayMap(A, cols); \ + } \ + template <> \ + void ColwiseSub( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + EigenArrayMap(C, cols, rows) = \ + 
(-ConstEigenArrayMap(B, cols, rows)).rowwise() + \ + ConstEigenVectorArrayMap(A, rows).transpose(); \ + } \ + DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Sub, -) + +DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(float) +DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(double) +DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int32_t) +DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t) + +#undef DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION + +#define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \ + template <> \ + void RowwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(B, cols, rows).inverse().colwise() * \ + ConstEigenVectorArrayMap(A, cols); \ + } \ + template <> \ + void ColwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(B, cols, rows).inverse().rowwise() * \ + ConstEigenVectorArrayMap(A, rows).transpose(); \ + } \ + DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Div, /) + +DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(float) +DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(double) +DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int32_t, Div, /) +DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /) + +#undef DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION + +#undef DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION +#undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION #define CAFFE2_SPECIALIZED_SET(T) \ template <> \ @@ -1053,48 +1205,13 @@ CAFFE2_SPECIALIZED_SET(uint8_t); CAFFE2_SPECIALIZED_SET(uint16_t); #undef CAFFE2_SPECIALIZED_SET -#define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \ - template <> \ - void name( \ - const int n, const T* a, const T* b, bool* y, CPUContext*) { \ - for (int i = 0; i < n; ++i) { \ - y[i] = a[i] op b[i]; \ - } \ - } \ - template <> \ - void name##ToRow( \ - const int m, \ - const int n, \ - const T* a, \ - const T* b, \ - bool* y, \ - CPUContext*) { \ - for (int i = 0; i < n * m; ++i) { \ - y[i] = a[i] op b[i % n]; \ - } \ - } - -#define CAFFE2_DEFINE_BINARY_OP(name, op) \ - CAFFE2_INSTANTIATE_BINARY_OP(name, op, float) \ - CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \ - CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t) - -CAFFE2_DEFINE_BINARY_OP(LT, <); -CAFFE2_DEFINE_BINARY_OP(LE, <=); -CAFFE2_DEFINE_BINARY_OP(GT, >); -CAFFE2_DEFINE_BINARY_OP(GE, >=); - -CAFFE2_INSTANTIATE_BINARY_OP(Or, |, bool); -CAFFE2_INSTANTIATE_BINARY_OP(And, &, bool); -CAFFE2_INSTANTIATE_BINARY_OP(Xor, ^, bool); - template <> void Not( - const int n, + const int N, const bool* x, bool* y, CPUContext* /*context*/) { - for (int i = 0; i < n; ++i) { + for (int i = 0; i < N; ++i) { y[i] = !x[i]; } } @@ -1119,6 +1236,363 @@ void Not( CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float); #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH +namespace { + +template +void RowwiseBinaryOp( + const int rows, + const int cols, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C) { + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + const int C_index = i * cols + j; + const int A_index = kBroadcast1st ? j : C_index; + const int B_index = kBroadcast1st ? 
C_index : j; + C[C_index] = op(A[A_index], B[B_index]); + } + } +} + +template +void ColwiseBinaryOp( + const int rows, + const int cols, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C) { + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + const int C_index = i * cols + j; + const int A_index = kBroadcast1st ? i : C_index; + const int B_index = kBroadcast1st ? C_index : i; + C[C_index] = op(A[A_index], B[B_index]); + } + } +} + +template +void BinaryOpWith2DBroadcasting( + const int ndim, + const int* dims, + const int pivot, + const bool broadcast_1st, + const Operator1& op1, + const Operator2& op2, + const TIn* A, + const TIn* B, + TOut* C, + CPUContext* context) { + const int rows = + std::accumulate(dims, dims + pivot, 1, std::multiplies()); + const int cols = + std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies()); + if (broadcast_1st) { + op1(rows, cols, A, B, C, context); + } else { + op2(rows, cols, A, B, C, context); + } +} + +template +void BroadcastBinaryOpImpl( + const int ndim, + const int* A_dims, + const int* B_dims, + const int* C_dims, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C) { + std::vector index(ndim, 0); + const int C_size = + std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies()); + for (int C_index = 0; C_index < C_size; ++C_index) { + const int A_index = utils::GetIndexFromDims(ndim, A_dims, index.data()); + const int B_index = utils::GetIndexFromDims(ndim, B_dims, index.data()); + C[C_index] = op(A[A_index], B[B_index]); + utils::IncreaseIndexInDims(ndim, C_dims, index.data()); + } +} + +} // namespace + +#define DELEGATE_1D_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func( \ + const int N, const TIn* A, const TIn* B, TOut* C, CPUContext*) { \ + std::transform(A, A + N, B, C, Op()); \ + } + +#define DEFINE_1D_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_1D_COMPARE_FUNCTION(EQ, std::equal_to) +DEFINE_1D_COMPARE_FUNCTION(NE, std::not_equal_to) +DEFINE_1D_COMPARE_FUNCTION(LT, std::less) +DEFINE_1D_COMPARE_FUNCTION(LE, std::less_equal) +DEFINE_1D_COMPARE_FUNCTION(GT, std::greater) +DEFINE_1D_COMPARE_FUNCTION(GE, std::greater_equal) + +#undef DEFINE_1D_COMPARE_FUNCTION + +DELEGATE_1D_BINARY_FUNCTION(bool, bool, And, std::logical_and) +DELEGATE_1D_BINARY_FUNCTION(bool, bool, Or, std::logical_or) +DELEGATE_1D_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) + +#define DEFINE_1D_BITWISE_BINARY_FUNCTION(Func, op) \ + DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, op) \ + DELEGATE_1D_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, op) \ + DELEGATE_1D_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, op) + +DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) +DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) +DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) + +#undef DEFINE_1D_BITWISE_BINARY_FUNCTION + +#undef DELEGATE_1D_BINARY_FUNCTION + +#define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext*) { \ + RowwiseBinaryOp, true>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Rowwise##Func( \ + const int rows, 
\ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext*) { \ + RowwiseBinaryOp, false>( \ + rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext*) { \ + ColwiseBinaryOp, true>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext*) { \ + ColwiseBinaryOp, false>( \ + rows, cols, Op(), A, B, C); \ + } + +#define DEFINE_2D_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_2D_COMPARE_FUNCTION(EQ, std::equal_to) +DEFINE_2D_COMPARE_FUNCTION(NE, std::not_equal_to) +DEFINE_2D_COMPARE_FUNCTION(LT, std::less) +DEFINE_2D_COMPARE_FUNCTION(LE, std::less_equal) +DEFINE_2D_COMPARE_FUNCTION(GT, std::greater) +DEFINE_2D_COMPARE_FUNCTION(GE, std::greater_equal) + +#undef DEFINE_2D_COMPARE_FUNCTION + +DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and) +DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or) +DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) + +#define DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) +DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) +DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) + +#undef DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION + +#undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION + +#define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ + template <> \ + void RowwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + RowwiseBinaryOp, true>( \ + rows, cols, std::divides(), A, B, C); \ + } \ + template <> \ + void ColwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + ColwiseBinaryOp, true>( \ + rows, cols, std::divides(), A, B, C); \ + } +DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t) +DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t) +#undef DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION + +#define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func( \ + const int A_ndim, \ + const int* A_dims, \ + const int B_ndim, \ + const int* B_dims, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext* context) { \ + const int ndim = std::max(A_ndim, B_ndim); \ + std::vector A_dims_array(ndim); \ + std::vector B_dims_array(ndim); \ + std::vector C_dims_array(ndim); \ + utils::ComputeBroadcastBinaryOpDims( \ + A_ndim, \ + A_dims, \ + B_ndim, \ + B_dims, \ + A_dims_array.data(), \ + B_dims_array.data(), \ + C_dims_array.data()); \ + if (A_dims_array == B_dims_array) { \ + const int size = std::accumulate( \ + C_dims_array.cbegin(), \ + C_dims_array.cend(), \ + 1, \ + std::multiplies()); \ + Func(size, A, B, C, context); \ + return; \ + } \ + int pivot; 
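Editor's note (illustrative, not part of the patch): to pin down the layout convention of the RowwiseBinaryOp/ColwiseBinaryOp helpers above for row-major data, "Rowwise" broadcasts a length-cols vector across every row and "Colwise" broadcasts a length-rows vector down every column; shown here for the second operand being broadcast (kBroadcast1st = false). A minimal numeric check, written as plain loops:

#include <cassert>

int main() {
  const int rows = 2;
  const int cols = 3;
  const int A[rows * cols] = {1, 2, 3, 4, 5, 6}; // row-major 2 x 3
  const int row_vec[cols] = {10, 20, 30};
  const int col_vec[rows] = {100, 200};
  int C[rows * cols];

  // RowwiseAdd: C(i, j) = A(i, j) + row_vec[j]
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      C[i * cols + j] = A[i * cols + j] + row_vec[j];
    }
  }
  assert(C[0] == 11 && C[5] == 36);

  // ColwiseAdd: C(i, j) = A(i, j) + col_vec[i]
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      C[i * cols + j] = A[i * cols + j] + col_vec[i];
    }
  }
  assert(C[0] == 101 && C[5] == 206);
  return 0;
}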
\ + bool broadcast_1st; \ + if (utils::IsRowwiseBroadcastBinaryOp( \ + ndim, \ + A_dims_array.data(), \ + B_dims_array.data(), \ + &pivot, \ + &broadcast_1st)) { \ + BinaryOpWith2DBroadcasting( \ + ndim, \ + C_dims_array.data(), \ + pivot, \ + broadcast_1st, \ + Rowwise##Func, \ + Rowwise##Func, \ + A, \ + B, \ + C, \ + context); \ + return; \ + } \ + if (utils::IsColwiseBroadcastBinaryOp( \ + ndim, \ + A_dims_array.data(), \ + B_dims_array.data(), \ + &pivot, \ + &broadcast_1st)) { \ + BinaryOpWith2DBroadcasting( \ + ndim, \ + C_dims_array.data(), \ + pivot, \ + broadcast_1st, \ + Colwise##Func, \ + Colwise##Func, \ + A, \ + B, \ + C, \ + context); \ + return; \ + } \ + BroadcastBinaryOpImpl( \ + ndim, \ + A_dims_array.data(), \ + B_dims_array.data(), \ + C_dims_array.data(), \ + Op(), \ + A, \ + B, \ + C); \ + } + +#define DEFINE_BROADCAST_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_BROADCAST_COMPARE_FUNCTION(EQ, std::equal_to) +DEFINE_BROADCAST_COMPARE_FUNCTION(NE, std::not_equal_to) +DEFINE_BROADCAST_COMPARE_FUNCTION(LT, std::less) +DEFINE_BROADCAST_COMPARE_FUNCTION(LE, std::less_equal) +DEFINE_BROADCAST_COMPARE_FUNCTION(GT, std::greater) +DEFINE_BROADCAST_COMPARE_FUNCTION(GE, std::greater_equal) + +#undef DEFINE_BROADCAST_COMPARE_FUNCTION + +#define DEFINE_BROADCAST_BINARY_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(double, double, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_BROADCAST_BINARY_FUNCTION(Add, std::plus) +DEFINE_BROADCAST_BINARY_FUNCTION(Sub, std::minus) +DEFINE_BROADCAST_BINARY_FUNCTION(Mul, std::multiplies) +DEFINE_BROADCAST_BINARY_FUNCTION(Div, std::divides) + +#undef DEFINE_BROADCAST_BINARY_FUNCTION + +DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and) +DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or) +DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) + +#define DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) +DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) +DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) + +#undef DEFINE_BITWISE_BROADCAST_BINARY_FUNCTION + +#undef DELEGATE_BROADCAST_BINARY_FUNCTION + template <> void RandUniform( const size_t n, @@ -1127,7 +1601,7 @@ void RandUniform( float* r, CPUContext* context) { std::uniform_real_distribution distribution(a, b); - for (auto i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { r[i] = distribution(context->RandGenerator()); } } @@ -1140,7 +1614,7 @@ void RandUniform( int* r, CPUContext* context) { std::uniform_int_distribution distribution(a, b); - for (auto i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { r[i] = distribution(context->RandGenerator()); } } @@ -1185,7 +1659,7 @@ void RandGaussian( float* r, CPUContext* 
context) { std::normal_distribution distribution(mean, std); - for (auto i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { r[i] = distribution(context->RandGenerator()); } } @@ -1280,7 +1754,7 @@ void Im2ColNdNCHWImpl( } else if (!is_padding) { Y_data[img_index] += X_data[col_index]; } - internal::IncreaseIndexInDims(N, col_shape + 1, d_iter.data()); + utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data()); } } } @@ -1693,10 +2167,9 @@ void BiasCHW( // FIXME: if input < kVecSizeInFloat, can't vectorize at all - int prologue = - kVecSizeInFloat - - // remainder in floats - (((uintptr_t) image) % (sizeof(float32x4_t))) / sizeof(float); + int prologue = kVecSizeInFloat - + // remainder in floats + (((uintptr_t)image) % (sizeof(float32x4_t))) / sizeof(float); int i = 0; // Prologue loop @@ -1900,7 +2373,7 @@ void TransposeCPUImpl( X + block_size * X_index, block_size * sizeof(T)); } - internal::IncreaseIndexInDims(itr_axes, Y_dims.data(), index.data()); + utils::IncreaseIndexInDims(itr_axes, Y_dims.data(), index.data()); } } @@ -1922,16 +2395,16 @@ void Transpose( TransposeCPUImpl(ndim, dims, axes, X, Y); } -#define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ - template <> \ - void Transpose( \ - const int ndim, \ - const int* dims, \ - const int* axes, \ - const T* X, \ - T* Y, \ - CPUContext* /* context */) { \ - TransposeCPUImpl(ndim, dims, axes, X, Y); \ +#define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ + template <> \ + void Transpose( \ + const int ndim, \ + const int* dims, \ + const int* axes, \ + const T* X, \ + T* Y, \ + CPUContext* /* context */) { \ + TransposeCPUImpl(ndim, dims, axes, X, Y); \ } CAFFE2_SPECIALIZED_TRANSPOSE(double) CAFFE2_SPECIALIZED_TRANSPOSE(int) diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index ca05238a62fe91..b667351c78d564 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -9,6 +9,8 @@ #include #include +#include + #include "caffe2/core/context_gpu.h" #include "caffe2/utils/conversions.h" @@ -19,117 +21,604 @@ namespace caffe2 { namespace math { -#define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Funcname, function) \ - __global__ void _Kernel_##T##_##Funcname(const int N, const T* x, T* y) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - y[i] = function(x[i]); \ - } \ - } \ - template <> \ - void Funcname( \ - const int N, const T* x, T* y, CUDAContext* context) { \ - _Kernel_##T##_##Funcname<<< \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_CUDA_NUM_THREADS, \ - 0, \ - context->cuda_stream()>>>(N, x, y); \ - } - -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, InvSqrt, rsqrtf); - -__device__ float cuda_sqrf(const float x) { +namespace { + +inline __host__ __device__ bool Not(const bool x) { + return !x; +} + +template +inline __host__ __device__ T Negate(const T& x) { + return -x; +} + +template +inline __host__ __device__ T Square(const T& x) { return x * x; } -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, cuda_sqrf); +template +inline __host__ __device__ T Sign(const T& x) { + return x > 0 ? 
T(1) : (x < 0 ? T(-1) : T(0)); +} + +#define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \ + template \ + struct Func##Functor { \ + inline __host__ __device__ T \ + operator()(const T& lhs, const T& rhs) const { \ + return lhs expr rhs; \ + } \ + }; \ + template <> \ + struct Func##Functor { \ + inline __host__ __device__ float16 \ + operator()(const float16& lhs, const float16& rhs) const { \ + return convert::To(convert::To( \ + lhs) expr convert::To(rhs)); \ + } \ + }; +DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +) +DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Sub, -) +DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Mul, *) +DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Div, /) +#undef DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR + +template +__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + sincos(__ldg(X + i), S + i, C + i); +#else + sincos(X[i], S + i, C + i); +#endif + } +} + +template +__global__ void SimpleBinaryOpCUDAKernel( + const int N, + const BinaryOperator op, + const TIn* A, + const TIn* B, + TOut* C) { + CUDA_1D_KERNEL_LOOP(i, N) { + C[i] = op(A[i], B[i]); + } +} + +template +__global__ void RowwiseBinaryOpCUDAKenel( + const int rows, + const int cols, + const BinaryOperator op, + const TIn* A, + const TIn* B, + TOut* C) { + const int size = rows * cols; + CUDA_1D_KERNEL_LOOP(C_index, size) { + const int j = C_index % cols; + const int A_index = broadcast_1st ? j : C_index; + const int B_index = broadcast_1st ? C_index : j; + C[C_index] = op(A[A_index], B[B_index]); + } +} + +template +__global__ void ColwiseBinaryOpCUDAKenel( + const int rows, + const int cols, + const BinaryOperator op, + const TIn* A, + const TIn* B, + TOut* C) { + const int size = rows * cols; + CUDA_1D_KERNEL_LOOP(C_index, size) { + const int i = C_index / cols; + const int A_index = broadcast_1st ? i : C_index; + const int B_index = broadcast_1st ? C_index : i; + C[C_index] = op(A[A_index], B[B_index]); + } +} + +template +__global__ void BroadcastBinaryOpCUDAKernel( + const int size, + const SimpleArray A_strides, + const SimpleArray B_strides, + const SimpleArray C_dims, + const BinaryOperator op, + const TIn* A, + const TIn* B, + TOut* C) { + CUDA_1D_KERNEL_LOOP(C_index, size) { + int A_index = 0; + int B_index = 0; + int C_index_val = C_index; +#pragma unroll + for (int i = D - 1; i >= 0; --i) { + const int d = C_index_val % C_dims.data[i]; + A_index += A_strides.data[i] == 0 ? 0 : d * A_strides.data[i]; + B_index += B_strides.data[i] == 0 ? 
0 : d * B_strides.data[i]; + C_index_val /= C_dims.data[i]; + } + C[C_index] = op(A[A_index], B[B_index]); + } +} + +template +void BinaryOpWith2DBroadcasting( + const int ndim, + const int* dims, + const int pivot, + const bool rowwise_broadcast, + const bool broadcast_1st, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C, + CUDAContext* context) { + const int rows = + std::accumulate(dims, dims + pivot, 1, std::multiplies()); + const int cols = + std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies()); + const int size = rows * cols; + if (rowwise_broadcast) { + if (broadcast_1st) { + RowwiseBinaryOpCUDAKenel + <<cuda_stream()>>>(rows, cols, op, A, B, C); + } else { + RowwiseBinaryOpCUDAKenel + <<cuda_stream()>>>(rows, cols, op, A, B, C); + } + } else { + if (broadcast_1st) { + ColwiseBinaryOpCUDAKenel + <<cuda_stream()>>>(rows, cols, op, A, B, C); + } else { + ColwiseBinaryOpCUDAKenel + <<cuda_stream()>>>(rows, cols, op, A, B, C); + } + } +} + +template +void BroadcastBinaryOpImpl( + const int* A_dims, + const int* B_dims, + const int* C_dims, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C, + CUDAContext* context) { + SimpleArray A_strides_array; + SimpleArray B_strides_array; + SimpleArray C_dims_array; + int A_stride = 1; + int B_stride = 1; + for (int i = D - 1; i >= 0; --i) { + A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride; + B_strides_array.data[i] = B_dims[i] == 1 ? 0 : B_stride; + A_stride *= A_dims[i]; + B_stride *= B_dims[i]; + } + std::copy(C_dims, C_dims + D, C_dims_array.data); + const int size = + std::accumulate(C_dims, C_dims + D, 1, std::multiplies()); + BroadcastBinaryOpCUDAKernel + <<cuda_stream()>>>( + size, A_strides_array, B_strides_array, C_dims_array, op, A, B, C); +} + +template +void BroadcastBinaryOp( + const int A_ndim, + const int* A_dims, + const int B_ndim, + const int* B_dims, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C, + CUDAContext* context) { + const int ndim = std::max(A_ndim, B_ndim); + std::vector A_dims_array(ndim); + std::vector B_dims_array(ndim); + std::vector C_dims_array(ndim); + utils::ComputeBroadcastBinaryOpDims( + A_ndim, + A_dims, + B_ndim, + B_dims, + A_dims_array.data(), + B_dims_array.data(), + C_dims_array.data()); + if (A_dims_array == B_dims_array) { + const int size = std::accumulate( + C_dims_array.cbegin(), C_dims_array.cend(), 1, std::multiplies()); + SimpleBinaryOpCUDAKernel + <<cuda_stream()>>>(size, op, A, B, C); + return; + } + int pivot; + bool broadcast_1st; + if (utils::IsRowwiseBroadcastBinaryOp( + ndim, + A_dims_array.data(), + B_dims_array.data(), + &pivot, + &broadcast_1st)) { + BinaryOpWith2DBroadcasting( + ndim, + C_dims_array.data(), + pivot, + true, + broadcast_1st, + op, + A, + B, + C, + context); + return; + } + if (utils::IsColwiseBroadcastBinaryOp( + ndim, + A_dims_array.data(), + B_dims_array.data(), + &pivot, + &broadcast_1st)) { + BinaryOpWith2DBroadcasting( + ndim, + C_dims_array.data(), + pivot, + false, + broadcast_1st, + op, + A, + B, + C, + context); + return; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3( + ndim, + BroadcastBinaryOpImpl, + TIn, + TOut, + BinaryOperator, + A_dims_array.data(), + B_dims_array.data(), + C_dims_array.data(), + op, + A, + B, + C, + context); +} + +} // namespace + +#define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func, op) \ + __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \ + CUDA_1D_KERNEL_LOOP(i, N) { \ + Y[i] = op(X[i]); \ + } \ + } \ + template <> \ + void Func( \ 
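Editor's note (illustrative, not part of the patch): BroadcastBinaryOpCUDAKernel and BroadcastBinaryOpImpl above implement general broadcasting by giving size-1 dimensions a stride of 0 and decomposing the linear output index inside the kernel. A host-side sketch of that index arithmetic, assuming the shapes have already been right-aligned and padded with 1s, as BroadcastBinaryOp does:

#include <cassert>

int main() {
  const int D = 2;
  const int C_dims[D] = {2, 3}; // output is 2 x 3
  const int A_dims[D] = {2, 1}; // A is 2 x 1, broadcast along dim 1

  // Row-major strides, zeroed where A_dims[i] == 1 so the same element is
  // re-read for every broadcast position.
  int A_strides[D];
  int stride = 1;
  for (int i = D - 1; i >= 0; --i) {
    A_strides[i] = (A_dims[i] == 1) ? 0 : stride;
    stride *= A_dims[i];
  }

  for (int C_index = 0; C_index < 2 * 3; ++C_index) {
    int A_index = 0;
    int rem = C_index;
    for (int i = D - 1; i >= 0; --i) {
      const int d = rem % C_dims[i];
      A_index += d * A_strides[i];
      rem /= C_dims[i];
    }
    // Every element of output row i reads A[i][0].
    assert(A_index == C_index / 3);
  }
  return 0;
}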
+ const int N, const T* x, T* y, CUDAContext* context) { \ + Func##CUDAKernel<<< \ + CAFFE_GET_BLOCKS(N), \ + CAFFE_CUDA_NUM_THREADS, \ + 0, \ + context->cuda_stream()>>>(N, x, y); \ + } + +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, InvSqrt, rsqrtf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, Square) + +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not, Not) + +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg, Negate) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg, Negate) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Neg, Negate) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Neg, Negate) + +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign, Sign) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign, Sign) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Sign, Sign) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Sign, Sign) #undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION -#define DELEGATE_SINCOS_CUDA_FUNCTION(T) \ - __global__ void _Kernel_##T##_##SinCos( \ - const int N, const T* x, T* ys, T* yc) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - sincos(x[i], ys + i, yc + i); \ - } \ - } \ +#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \ template <> \ void SinCos( \ const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \ - _Kernel_##T##_##SinCos<<< \ + SinCosCUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ CAFFE_CUDA_NUM_THREADS, \ 0, \ context->cuda_stream()>>>(N, x, ys, yc); \ } +CAFFE2_SPECIALIZED_CUDA_SINCOS(float) +CAFFE2_SPECIALIZED_CUDA_SINCOS(double) +#undef CAFFE2_SPECIALIZED_CUDA_SINCOS + +#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func( \ + const int N, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + SimpleBinaryOpCUDAKernel> \ + <<cuda_stream()>>>(N, Op(), A, B, C); \ + } -DELEGATE_SINCOS_CUDA_FUNCTION(float) -DELEGATE_SINCOS_CUDA_FUNCTION(double) +#define DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) -#undef DELEGATE_SINCOS_CUDA_FUNCTION +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LT, thrust::less) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GT, thrust::greater) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) -#define DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(T, Funcname, expr) \ - __global__ void _Kernel_##T##_##Funcname( \ - const int N, const T* a, const T* b, T* y) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - float r = convert::To(a[i]) expr convert::To(b[i]); \ - y[i] = convert::To(r); \ - } \ - } \ - template <> \ - void Funcname( \ - const int N, 
const T* a, const T* b, T* y, CUDAContext* context) { \ - _Kernel_##T##_##Funcname<<< \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_CUDA_NUM_THREADS, \ - 0, \ - context->cuda_stream()>>>(N, a, b, y); \ - } - -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Add, +); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(int32_t, Add, +); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Sub, -); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Mul, *); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Div, /); - -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Add, +); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Sub, -); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Mul, *); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Div, /); - -#undef DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION - -#define DELEGATE_SIMPLE_CUDA_BINARY_PREFIX_FUNCTION(T, Funcname, func) \ - __global__ void _Kernel_##T##_##Funcname( \ - const int N, const T* a, const T* b, T* y) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - float r = \ - func(convert::To(a[i]), convert::To(b[i])); \ - y[i] = convert::To(r); \ - } \ - } \ - template <> \ - void Funcname( \ - const int N, const T* a, const T* b, T* y, CUDAContext* context) { \ - _Kernel_##T##_##Funcname<<< \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_CUDA_NUM_THREADS, \ - 0, \ - context->cuda_stream()>>>(N, a, b, y); \ - } +#undef DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION + +#define DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) + +DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Add, AddFunctor) +DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Sub, SubFunctor) +DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Mul, MulFunctor) +DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Div, DivFunctor) + +#undef DEFINE_SIMPLE_CUDA_BINARY_FUNCTION + +DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) +DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) +DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) + +#define DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) +DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) +DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) -DELEGATE_SIMPLE_CUDA_BINARY_PREFIX_FUNCTION(float, ElemwiseMax, fmaxf); +#undef DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION -#undef DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION +DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( + float, + float, + ElemwiseMax, + thrust::maximum); + +#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION + +#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + const int size = rows * cols; \ + RowwiseBinaryOpCUDAKenel, true> \ + <<cuda_stream()>>>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + 
const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + const int size = rows * cols; \ + RowwiseBinaryOpCUDAKenel, false> \ + <<cuda_stream()>>>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + const int size = rows * cols; \ + ColwiseBinaryOpCUDAKenel, true> \ + <<cuda_stream()>>>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + const int size = rows * cols; \ + ColwiseBinaryOpCUDAKenel, false> \ + <<cuda_stream()>>>(rows, cols, Op(), A, B, C); \ + } + +#define DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) + +#undef DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION + +#define DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int64_t, std::int64_t, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) + +DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor) +DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor) +DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor) +DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor) + +#undef DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION + +DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) +DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) +DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) + +#define DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int64_t, std::int64_t, Func, Op) + +DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) +DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) +DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) + +#undef DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION + +#undef DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION + +#define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func( \ + const int A_ndim, \ + const int* A_dims, \ + const int B_ndim, \ + const int* B_dims, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + 
CUDAContext* context) { \ + BroadcastBinaryOp<TIn, TOut, Op<TIn>>( \ + A_ndim, A_dims, B_ndim, B_dims, Op<TIn>(), A, B, C, context); \ + } + +#define DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) + +#undef DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION + +#define DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int64_t, std::int64_t, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) + +DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor) +DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor) +DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor) +DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor) + +#undef DEFINE_BROADCAST_CUDA_BINARY_FUNCTION + +DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) +DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) +DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) + +#define DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) +DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) +DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) + +#undef DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION + +#undef DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ template <> \ @@ -2228,43 +2717,6 @@ void Maximum( namespace { -std::vector<int> MakeTransposeAxes( - const int num_dims, - const int* dims, - const int num_axes, - const int* axes) { - std::vector<int> transpose_axes(num_dims); - const int d = num_dims - num_axes; - std::copy_n(axes, num_axes, transpose_axes.begin() + d); - std::sort(transpose_axes.begin() + d, transpose_axes.end()); - int p = 0; - int q = d; - for (int i = 0; i < num_dims; ++i) { - if (q < num_dims && i == transpose_axes[q]) { - ++q; - } else { - transpose_axes[p++] = i; - } - } - return transpose_axes; -} - -template <int D> -void ComputeTransposedStrides( - const int* X_dims, - const int* axes, - int* X_strides) { - int buff[D]; - int cur_stride = 1; - for (int i = D - 1; i >= 0; --i) { - buff[i] = cur_stride; - cur_stride *= X_dims[i]; - } - for (int i = 0; i < D; ++i) { - X_strides[i] = buff[axes[i]]; - } -} - template <typename T, class Reducer, int D> __global__ void ReduceTensorCUDAKernel( const int outer_size, @@ 
-2282,9 +2734,9 @@ __global__ void ReduceTensorCUDAKernel( int X_index = 0; int Y_index = i * inner_size + j; #pragma unroll - for (int i = D - 1; i >= 0; --i) { - X_index += (Y_index % Y_dims.data[i]) * X_strides.data[i]; - Y_index /= Y_dims.data[i]; + for (int d = D - 1; d >= 0; --d) { + X_index += (Y_index % Y_dims.data[d]) * X_strides.data[d]; + Y_index /= Y_dims.data[d]; } #if __CUDA_ARCH__ >= 350 val = reducer(val, __ldg(X + X_index)); @@ -2313,7 +2765,7 @@ void ReduceTensorCUDAImpl( CUDAContext* context) { SimpleArray<int, D> X_strides; SimpleArray<int, D> Y_dims; - ComputeTransposedStrides<D>(dims, axes, X_strides.data); + utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); for (int i = 0; i < D; ++i) { Y_dims.data[i] = dims[axes[i]]; } @@ -2337,8 +2789,9 @@ void ReduceTensorCUDA( T* Y, CUDAContext* context) { CAFFE_ENFORCE_LE(num_axes, num_dims); - const std::vector<int> transpose_axes = - MakeTransposeAxes(num_dims, dims, num_axes, axes); + std::vector<int> transpose_axes(num_dims); + utils::ComputeTransposeAxesForReduceOp( + num_dims, num_axes, axes, transpose_axes.data()); const int pivot = num_dims - num_axes; int outer_size = 1; for (int i = 0; i < pivot; ++i) { @@ -2648,7 +3101,7 @@ void MomentsCUDAImpl( CUDAContext* context) { SimpleArray<int, D> X_strides; SimpleArray<int, D> Y_dims; - ComputeTransposedStrides<D>(dims, axes, X_strides.data); + utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); for (int i = 0; i < D; ++i) { Y_dims.data[i] = dims[axes[i]]; } @@ -2671,8 +3124,9 @@ void MomentsCUDA( T* variance, CUDAContext* context) { CAFFE_ENFORCE_LE(num_axes, num_dims); - const std::vector<int> transpose_axes = - MakeTransposeAxes(num_dims, dims, num_axes, axes); + std::vector<int> transpose_axes(num_dims); + utils::ComputeTransposeAxesForReduceOp( + num_dims, num_axes, axes, transpose_axes.data()); const int pivot = num_dims - num_axes; int outer_size = 1; for (int i = 0; i < pivot; ++i) { @@ -2757,7 +3211,7 @@ void TransposeCUDAImpl( CUDAContext* context) { SimpleArray<int, D> X_strides; SimpleArray<int, D> Y_dims; - ComputeTransposedStrides<D>(dims, axes, X_strides.data); + utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); int size = 1; for (int i = 0; i < D; ++i) { Y_dims.data[i] = dims[axes[i]]; diff --git a/caffe2/utils/math_utils.cc b/caffe2/utils/math_utils.cc new file mode 100644 index 00000000000000..4f245c9b0fd0e3 --- /dev/null +++ b/caffe2/utils/math_utils.cc @@ -0,0 +1,141 @@ +#include "caffe2/utils/math_utils.h" + +#include <algorithm> +#include <vector> + +#include "caffe2/core/logging.h" + +namespace caffe2 { +namespace math { +namespace utils { + +void IncreaseIndexInDims(const int n, const int* dims, int* index) { + for (int i = n - 1; i >= 0; --i) { + ++index[i]; + if (index[i] >= dims[i]) { + index[i] -= dims[i]; + } else { + break; + } + } +} + +int GetIndexFromDims(const int n, const int* dims, const int* index) { + int sum = 0; + for (int i = 0; i < n; ++i) { + if (dims[i] > 1) { + sum = sum * dims[i] + index[i]; + } + } + return sum; +} + +void ComputeBroadcastBinaryOpDims( + const int A_ndim, + const int* A_dims, + const int B_ndim, + const int* B_dims, + int* A_broadcast_dims, + int* B_broadcast_dims, + int* C_broadcast_dims) { + const int ndim = std::max(A_ndim, B_ndim); + std::fill(A_broadcast_dims, A_broadcast_dims + ndim - A_ndim, 1); + std::fill(B_broadcast_dims, B_broadcast_dims + ndim - B_ndim, 1); + std::copy(A_dims, A_dims + A_ndim, A_broadcast_dims + ndim - A_ndim); + std::copy(B_dims, B_dims + B_ndim, B_broadcast_dims + ndim - B_ndim); + for (int i = 0; i < ndim; ++i) { + CAFFE_ENFORCE( + 
A_broadcast_dims[i] == B_broadcast_dims[i] || + A_broadcast_dims[i] <= 1 || B_broadcast_dims[i] <= 1); + if (A_broadcast_dims[i] == 0 || B_broadcast_dims[i] == 0) { + C_broadcast_dims[i] = 0; + } else { + C_broadcast_dims[i] = std::max(A_broadcast_dims[i], B_broadcast_dims[i]); + } + } +} + +bool IsRowwiseBroadcastBinaryOp( + const int ndim, + const int* A_dims, + const int* B_dims, + int* pivot, + bool* broadcast_1st) { + if (ndim == 0) { + return false; + } + int A_pivot = 0; + for (; A_pivot < ndim && A_dims[A_pivot] == 1; ++A_pivot) + ; + int B_pivot = 0; + for (; B_pivot < ndim && B_dims[B_pivot] == 1; ++B_pivot) + ; + if (A_pivot == B_pivot) { + return false; + } + *pivot = std::max(A_pivot, B_pivot); + *broadcast_1st = A_pivot > B_pivot; + return std::equal(A_dims + *pivot, A_dims + ndim, B_dims + *pivot); +} + +bool IsColwiseBroadcastBinaryOp( + const int ndim, + const int* A_dims, + const int* B_dims, + int* pivot, + bool* broadcast_1st) { + if (ndim == 0) { + return false; + } + int A_pivot = ndim - 1; + for (; A_pivot >= 0 && A_dims[A_pivot] == 1; --A_pivot) + ; + int B_pivot = ndim - 1; + for (; B_pivot >= 0 && B_dims[B_pivot] == 1; --B_pivot) + ; + if (A_pivot == B_pivot) { + return false; + } + *pivot = std::min(A_pivot, B_pivot) + 1; + *broadcast_1st = A_pivot < B_pivot; + return std::equal(A_dims, A_dims + *pivot, B_dims); +} + +void ComputeTransposeAxesForReduceOp( + const int num_dims, + const int num_reduce_axes, + const int* reduce_axes, + int* transpose_axes) { + const int d = num_dims - num_reduce_axes; + std::copy_n(reduce_axes, num_reduce_axes, transpose_axes + d); + std::sort(transpose_axes + d, transpose_axes + num_dims); + int p = 0; + int q = d; + for (int i = 0; i < num_dims; ++i) { + if (q < num_dims && i == transpose_axes[q]) { + ++q; + } else { + transpose_axes[p++] = i; + } + } +} + +void ComputeTransposedStrides( + const int ndim, + const int* dims, + const int* axes, + int* strides) { + std::vector<int> buff(ndim); + int cur_stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + buff[i] = cur_stride; + cur_stride *= dims[i]; + } + for (int i = 0; i < ndim; ++i) { + strides[i] = buff[axes[i]]; + } +} + +} // namespace utils +} // namespace math +} // namespace caffe2 diff --git a/caffe2/utils/math_utils.h b/caffe2/utils/math_utils.h new file mode 100644 index 00000000000000..f5570391b422d7 --- /dev/null +++ b/caffe2/utils/math_utils.h @@ -0,0 +1,54 @@ +#ifndef CAFFE2_UTILS_MATH_UTILS_H_ +#define CAFFE2_UTILS_MATH_UTILS_H_ + +namespace caffe2 { +namespace math { +namespace utils { + +// Increase the index digits by one based on dims. +void IncreaseIndexInDims(const int n, const int* dims, int* index); + +// Get index value from dims and index digits. +int GetIndexFromDims(const int n, const int* dims, const int* index); + +// Computes the broadcast binary operation dims. 
+void ComputeBroadcastBinaryOpDims( + const int A_ndim, + const int* A_dims, + const int B_ndim, + const int* B_dims, + int* A_broadcast_dims, + int* B_broadcast_dims, + int* C_broadcast_dims); + +bool IsRowwiseBroadcastBinaryOp( + const int ndim, + const int* A_dims, + const int* B_dims, + int* pivot, + bool* broadcast_1st); + +bool IsColwiseBroadcastBinaryOp( + const int ndim, + const int* A_dims, + const int* B_dims, + int* pivot, + bool* broadcast_1st); + +void ComputeTransposeAxesForReduceOp( + const int num_dims, + const int num_reduce_axes, + const int* reduce_axes, + int* transpose_axes); + +void ComputeTransposedStrides( + const int ndim, + const int* dims, + const int* axes, + int* strides); + +} // namespace utils +} // namespace math +} // namespace caffe2 + +#endif // CAFFE2_UTILS_MATH_UTILS_H_ diff --git a/caffe2/utils/murmur_hash3.cc b/caffe2/utils/murmur_hash3.cc index 3173d3a80415d2..28b373165c98e5 100644 --- a/caffe2/utils/murmur_hash3.cc +++ b/caffe2/utils/murmur_hash3.cc @@ -18,6 +18,8 @@ #define FORCE_INLINE __forceinline +#define FALLTHROUGH_INTENDED + #include #define ROTL32(x, y) _rotl(x, y) @@ -31,6 +33,17 @@ #define FORCE_INLINE inline __attribute__((__always_inline__)) +#if defined(__clang__) && defined(__has_cpp_attribute) +#if __has_cpp_attribute(clang::fallthrough) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#endif +#elif defined(__GNUC__) && __GNUC__ > 6 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#endif +#ifndef FALLTHROUGH_INTENDED +#define FALLTHROUGH_INTENDED +#endif + inline uint32_t rotl32(uint32_t x, int8_t r) { return (x << r) | (x >> (32 - r)); } @@ -121,8 +134,10 @@ void MurmurHash3_x86_32(const void* key, int len, uint32_t seed, void* out) { switch (len & 3) { case 3: k1 ^= tail[2] << 16; + FALLTHROUGH_INTENDED; case 2: k1 ^= tail[1] << 8; + FALLTHROUGH_INTENDED; case 1: k1 ^= tail[0]; k1 *= c1; @@ -222,47 +237,61 @@ void MurmurHash3_x86_128( switch (len & 15) { case 15: k4 ^= tail[14] << 16; + FALLTHROUGH_INTENDED; case 14: k4 ^= tail[13] << 8; + FALLTHROUGH_INTENDED; case 13: k4 ^= tail[12] << 0; k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4; + FALLTHROUGH_INTENDED; case 12: k3 ^= tail[11] << 24; + FALLTHROUGH_INTENDED; case 11: k3 ^= tail[10] << 16; + FALLTHROUGH_INTENDED; case 10: k3 ^= tail[9] << 8; + FALLTHROUGH_INTENDED; case 9: k3 ^= tail[8] << 0; k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3; + FALLTHROUGH_INTENDED; case 8: k2 ^= tail[7] << 24; + FALLTHROUGH_INTENDED; case 7: k2 ^= tail[6] << 16; + FALLTHROUGH_INTENDED; case 6: k2 ^= tail[5] << 8; + FALLTHROUGH_INTENDED; case 5: k2 ^= tail[4] << 0; k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2; + FALLTHROUGH_INTENDED; case 4: k1 ^= tail[3] << 24; + FALLTHROUGH_INTENDED; case 3: k1 ^= tail[2] << 16; + FALLTHROUGH_INTENDED; case 2: k1 ^= tail[1] << 8; + FALLTHROUGH_INTENDED; case 1: k1 ^= tail[0] << 0; k1 *= c1; @@ -359,37 +388,51 @@ void MurmurHash3_x64_128( switch (len & 15) { case 15: k2 ^= ((uint64_t)tail[14]) << 48; + FALLTHROUGH_INTENDED; case 14: k2 ^= ((uint64_t)tail[13]) << 40; + FALLTHROUGH_INTENDED; case 13: k2 ^= ((uint64_t)tail[12]) << 32; + FALLTHROUGH_INTENDED; case 12: k2 ^= ((uint64_t)tail[11]) << 24; + FALLTHROUGH_INTENDED; case 11: k2 ^= ((uint64_t)tail[10]) << 16; + FALLTHROUGH_INTENDED; case 10: k2 ^= ((uint64_t)tail[9]) << 8; + FALLTHROUGH_INTENDED; case 9: k2 ^= ((uint64_t)tail[8]) << 0; k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2; + FALLTHROUGH_INTENDED; case 8: k1 ^= ((uint64_t)tail[7]) << 56; + FALLTHROUGH_INTENDED; case 7: k1 ^= 
((uint64_t)tail[6]) << 48; + FALLTHROUGH_INTENDED; case 6: k1 ^= ((uint64_t)tail[5]) << 40; + FALLTHROUGH_INTENDED; case 5: k1 ^= ((uint64_t)tail[4]) << 32; + FALLTHROUGH_INTENDED; case 4: k1 ^= ((uint64_t)tail[3]) << 24; + FALLTHROUGH_INTENDED; case 3: k1 ^= ((uint64_t)tail[2]) << 16; + FALLTHROUGH_INTENDED; case 2: k1 ^= ((uint64_t)tail[1]) << 8; + FALLTHROUGH_INTENDED; case 1: k1 ^= ((uint64_t)tail[0]) << 0; k1 *= c1; diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h index 4c3bfcf0a4fe46..9d2ffd57864b68 100644 --- a/caffe2/utils/threadpool/WorkersPool.h +++ b/caffe2/utils/threadpool/WorkersPool.h @@ -331,9 +331,9 @@ class WorkersPool { // One of the tasks will be run on the current thread. int workers_count = tasks.size() - 1; CreateWorkers(workers_count); - DCHECK_LE(workers_count, workers_.size()); + DCHECK_LE(workers_count, (int)workers_.size()); counter_to_decrement_when_ready_.Reset(workers_count); - for (auto task = 1; task < tasks.size(); ++task) { + for (size_t task = 1; task < tasks.size(); ++task) { workers_[task - 1]->StartWork(tasks[task].get()); } // Execute the remaining workload immediately on the current thread. diff --git a/cmake/Caffe2Config.cmake.in b/cmake/Caffe2Config.cmake.in index 3b9bb04afa213a..fe393669750304 100644 --- a/cmake/Caffe2Config.cmake.in +++ b/cmake/Caffe2Config.cmake.in @@ -69,22 +69,31 @@ if (@CAFFE2_KNOWN_PROTOBUF_VERSION@) endif() endif() -# Cuda if (@USE_CUDA@) + # The file public/cuda.cmake exclusively uses CAFFE2_USE_*. + # If Caffe2 was compiled with the libraries below, they must + # be found again when including the Caffe2 target. + set(CAFFE2_USE_CUDA @USE_CUDA@) + set(CAFFE2_USE_CUDNN @USE_CUDNN@) + set(CAFFE2_USE_TENSORRT @USE_TENSORRT@) include("${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake") - if (NOT CAFFE2_FOUND_CUDA) + if (@CAFFE2_USE_CUDA@ AND NOT CAFFE2_USE_CUDA) message(FATAL_ERROR - "Your installed Caffe2 version uses CUDA but I cannot find the CUDA " - "libraries. Please set the proper CUDA prefixes and / or install " - "CUDA.") + "Your installed Caffe2 version uses CUDA but I cannot find the CUDA " + "libraries. Please set the proper CUDA prefixes and / or install " + "CUDA.") endif() - if (@BUILD_CAFFE2@) - if (NOT CAFFE2_FOUND_CUDNN) - message(FATAL_ERROR - "Your installed Caffe2 version uses cuDNN but I cannot find the cuDNN " - "libraries. Please set the proper cuDNN prefixes and / or install " - "cuDNN.") - endif() + if (@CAFFE2_USE_CUDNN@ AND NOT CAFFE2_USE_CUDNN) + message(FATAL_ERROR + "Your installed Caffe2 version uses cuDNN but I cannot find the cuDNN " + "libraries. Please set the proper cuDNN prefixes and / or install " + "cuDNN.") + endif() + if (@CAFFE2_USE_TENSORRT@ AND NOT CAFFE2_USE_TENSORRT) + message(FATAL_ERROR + "Your installed Caffe2 version uses TensorRT but I cannot find the TensorRT " + "libraries. Please set the proper TensorRT prefixes and / or install " + "TensorRT.") endif() endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 9a6611ebb99b72..30802b28590a5d 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -180,7 +180,7 @@ if(BUILD_TEST) set(gtest_force_shared_crt ON) endif() add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest) - include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/include) + include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/include) # We will not need to test benchmark lib itself. 
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Disable benchmark testing as we don't need it.") @@ -393,32 +393,53 @@ endif() # ---[ CUDA if(USE_CUDA) + # public/*.cmake uses CAFFE2_USE_* + set(CAFFE2_USE_CUDA ${USE_CUDA}) + set(CAFFE2_USE_CUDNN ${USE_CUDNN}) + set(CAFFE2_USE_TENSORRT ${USE_TENSORRT}) include(${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake) - if(CAFFE2_FOUND_CUDA) + if(CAFFE2_USE_CUDA) # A helper variable recording the list of Caffe2 dependent libraries # caffe2::cudart is dealt with separately, due to CUDA_ADD_LIBRARY # design reason (it adds CUDA_LIBRARIES itself). - set(Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS - caffe2::cuda caffe2::cufft caffe2::curand caffe2::nvrtc) - if(CAFFE2_FOUND_CUDNN) - LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn) + set(Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cufft caffe2::curand) + if(BUILD_CAFFE2) + # Don't be deceived! caffe2::cuda is the low-level DRIVER API, + # not the actual CUDA library. Caffe2 depends directly on nvrtc + # and needs it, but ATen doesn't use it at all, and so the + # dependency is unnecessary. + # + # BTW, if you change this so that PyTorch has this dependency + # again, make sure Mac OS X GPU builds still work; there's + # a decent chance the library finding algorithm picked up + # on cuda.framework, which is totally not going to work when + # linking. + list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cuda caffe2::nvrtc) + endif() + if(CAFFE2_USE_CUDNN) + list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn) else() if(BUILD_CAFFE2) # TODO: Get rid of special case for Caffe2 where we require # CUDA *and* cuDNN to be installed. message(WARNING - "Not compiling with CUDA since cuDNN is missing. Suppress " - "this warning with -DUSE_CUDA=OFF.") + "Not compiling with CUDA since cuDNN is missing. Suppress " + "this warning with -DUSE_CUDA=OFF.") caffe2_update_option(USE_CUDA OFF) caffe2_update_option(USE_CUDNN OFF) + caffe2_update_option(USE_TENSORRT OFF) + set(CAFFE2_USE_CUDA OFF) + set(CAFFE2_USE_CUDNN OFF) + set(CAFFE2_USE_TENSORRT OFF) else() message(WARNING - "Not compiling with cuDNN. Suppress this warning with " - "-DUSE_CUDNN=OFF.") + "Not compiling with cuDNN. Suppress this warning with " + "-DUSE_CUDNN=OFF.") caffe2_update_option(USE_CUDNN OFF) + set(CAFFE2_USE_CUDNN OFF) endif() endif() - if(USE_CUDA) + if(CAFFE2_USE_CUDA) if(CAFFE2_STATIC_LINK_CUDA) # When statically linking, this must be the order of the libraries LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS @@ -426,21 +447,28 @@ if(USE_CUDA) else() LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cublas) endif() - if(USE_TENSORRT) + if(CAFFE2_USE_TENSORRT) list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::tensorrt) + else() + caffe2_update_option(USE_TENSORRT OFF) endif() endif() else() message(WARNING - "Not compiling with CUDA. Suppress this warning with " - "-DUSE_CUDA=OFF.") + "Not compiling with CUDA. 
Suppress this warning with " + "-DUSE_CUDA=OFF.") caffe2_update_option(USE_CUDA OFF) + caffe2_update_option(USE_CUDNN OFF) + caffe2_update_option(USE_TENSORRT OFF) + set(CAFFE2_USE_CUDA OFF) + set(CAFFE2_USE_CUDNN OFF) + set(CAFFE2_USE_TENSORRT OFF) endif() endif() # ---[ HIP -if(BUILD_CAFFE2) - include(cmake/public/LoadHIP.cmake) +if(BUILD_CAFFE2 OR BUILD_ATEN) + include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake) if(PYTORCH_FOUND_HIP) message(INFO "Compiling with HIP for AMD.") caffe2_update_option(USE_ROCM ON) @@ -449,7 +477,7 @@ if(BUILD_CAFFE2) set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${Caffe2_HIP_INCLUDES} ${thrust_INCLUDE_DIRS}) set(Caffe2_HIP_DEPENDENCY_LIBS - ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES}) + ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipsparse_LIBRARIES} ${hipblas_LIBRARIES}) # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES} list(APPEND Caffe2_HIP_DEPENDENCY_LIBS @@ -464,7 +492,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2) include_directories(${HIP_PATH}/include) include_directories(${HIPBLAS_PATH}/include) include_directories(${HIPSPARSE_PATH}/include) - include_directories(${HIPRNG_PATH}/include) + include_directories(${HIPRAND_PATH}/include) include_directories(${THRUST_PATH}) # load HIP cmake module and load platform id @@ -472,7 +500,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2) EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) # Link with HIPCC https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#linking-with-hipcc - SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE}) + # SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE}) # Show message that we're using ROCm. MESSAGE(STATUS "ROCM TRUE:") @@ -661,7 +689,6 @@ if (BUILD_ATEN) if (USE_CUDA) list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS aten_op_header_gen) endif() - include_directories(${PROJECT_BINARY_DIR}/caffe2/contrib/aten/aten/src/ATen) include_directories(${PROJECT_BINARY_DIR}/caffe2/contrib/aten) endif() endif() @@ -1126,6 +1153,13 @@ if (BUILD_ATEN) set(AT_CUDNN_ENABLED 1) ENDIF() + IF (NOT USE_ROCM) + MESSAGE(STATUS "MIOpen not found. 
Compiling without MIOpen support") + set(AT_MIOPEN_ENABLED 0) + ELSE() + set(AT_MIOPEN_ENABLED 1) + ENDIF() + if (NO_MKLDNN) message("disabling MKLDNN because NO_MKLDNN is set") set(AT_MKLDNN_ENABLED 0) diff --git a/cmake/Modules/FindMIOpen.cmake b/cmake/Modules/FindMIOpen.cmake new file mode 100644 index 00000000000000..6a047df7e52f3e --- /dev/null +++ b/cmake/Modules/FindMIOpen.cmake @@ -0,0 +1,63 @@ +# - Try to find MIOpen +# +# The following variables are optionally searched for defaults +# MIOPEN_ROOT_DIR: Base directory where all MIOpen components are found +# +# The following are set after configuration is done: +# MIOPEN_FOUND +# MIOPEN_INCLUDE_DIRS +# MIOPEN_LIBRARIES +# MIOPEN_LIBRARY_DIRS +# +# Borrowed from https://github.com/caffe2/caffe2/blob/master/cmake/Modules/FindCuDNN.cmake + +include(FindPackageHandleStandardArgs) + +set(MIOPEN_ROOT_DIR "" CACHE PATH "Folder contains MIOpen") + +if($ENV{MIOPEN_INCLUDE_DIR}) + SET(MIOPEN_INCLUDE_DIR $ENV{MIOPEN_INCLUDE_DIR}) +else($ENV{MIOPEN_INCLUDE_DIR}) + find_path(MIOPEN_INCLUDE_DIR miopen.h + HINTS ${MIOPEN_ROOT_DIR} + PATH_SUFFIXES include include/miopen) +endif($ENV{MIOPEN_INCLUDE_DIR}) + +if($ENV{MIOPEN_LIBRARY}) + SET(MIOPEN_LIBRARY $ENV{MIOPEN_LIBRARY}) +else($ENV{MIOPEN_LIBRARY}) + find_library(MIOPEN_LIBRARY MIOpen + HINTS ${MIOPEN_LIB_DIR} ${MIOPEN_ROOT_DIR} + PATH_SUFFIXES lib) +endif($ENV{MIOPEN_LIBRARY}) + +find_package_handle_standard_args( + MIOPEN DEFAULT_MSG MIOPEN_INCLUDE_DIR MIOPEN_LIBRARY) + +if(MIOPEN_FOUND) + # get MIOpen version + file(READ ${MIOPEN_INCLUDE_DIR}/version.h MIOPEN_HEADER_CONTENTS) + string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)" + MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1" + MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}") + string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)" + MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1" + MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}") + string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)" + MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1" + MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}") + # Assemble MIOpen version + if(NOT MIOPEN_VERSION_MAJOR) + set(MIOPEN_VERSION "?") + else() + set(MIOPEN_VERSION "${MIOPEN_VERSION_MAJOR}.${MIOPEN_VERSION_MINOR}.${MIOPEN_VERSION_PATCH}") + endif() + + set(MIOPEN_INCLUDE_DIRS ${MIOPEN_INCLUDE_DIR}) + set(MIOPEN_LIBRARIES ${MIOPEN_LIBRARY}) + message(STATUS "Found MIOpen: v${MIOPEN_VERSION} (include: ${MIOPEN_INCLUDE_DIR}, library: ${MIOPEN_LIBRARY})") + mark_as_advanced(MIOPEN_ROOT_DIR MIOPEN_LIBRARY MIOPEN_INCLUDE_DIR) +endif() diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 11e6068f65fe82..8e287336e7d536 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -13,14 +13,14 @@ macro(custom_protobuf_find) if (${CAFFE2_LINK_LOCAL_PROTOBUF}) # If we are going to link protobuf locally, we will need to turn off # shared libs build for protobuf. - set(protobuf_BUILD_SHARED_LIBS OFF) + option(protobuf_BUILD_SHARED_LIBS "" OFF) else() # If we are building Caffe2 as shared libs, we will also build protobuf as # shared libs. - set(protobuf_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + option(protobuf_BUILD_SHARED_LIBS "" ${BUILD_SHARED_LIBS}) endif() # We will make sure that protobuf and caffe2 uses the same msvc runtime. 
- set(protobuf_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) + option(protobuf_MSVC_STATIC_RUNTIME "" ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) if (${CAFFE2_LINK_LOCAL_PROTOBUF}) set(__caffe2_CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ${CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS}) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index 7d813cc22de066..33cef8d0095fee 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -38,13 +38,6 @@ ELSE() SET(ROCBLAS_PATH $ENV{ROCBLAS_PATH}) ENDIF() -# HIPRNG_PATH -IF(NOT DEFINED ENV{HIPRNG_PATH}) - SET(HIPRNG_PATH ${ROCM_PATH}/hcrng) -ELSE() - SET(HIPRNG_PATH $ENV{HIPRNG_PATH}) -ENDIF() - # HIPSPARSE_PATH IF(NOT DEFINED ENV{HIPSPARSE_PATH}) SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse) @@ -103,11 +96,15 @@ IF(HIP_FOUND) set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) + set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas) + set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) find_package(rocrand REQUIRED) find_package(hiprand REQUIRED) find_package(rocblas REQUIRED) find_package(miopen REQUIRED) + #find_package(hipblas REQUIRED) There's a bug with the CMake file in the Hipblas package. + #find_package(hipsparse REQUIRED) # TODO: hip_hcc has an interface include flag "-hc" which is only # recognizable by hcc, but not gcc and clang. Right now in our @@ -117,6 +114,9 @@ IF(HIP_FOUND) # TODO: miopen_LIBRARIES should return fullpath to the library file, # however currently it's just the lib name FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib) + FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib) + FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib) + FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib) set(thrust_INCLUDE_DIRS ${THRUST_PATH} ${THRUST_PATH}/thrust/system/cuda/detail/cub-hip) diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index cd790ed98d64e1..b4b2b89c4380d3 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -1,8 +1,5 @@ # ---[ cuda -set(CAFFE2_FOUND_CUDA FALSE) -set(CAFFE2_FOUND_CUDNN FALSE) - # Find CUDA. find_package(CUDA 7.0) if(NOT CUDA_FOUND) @@ -10,9 +7,9 @@ if(NOT CUDA_FOUND) "Caffe2: CUDA cannot be found. Depending on whether you are building " "Caffe2 or a Caffe2 dependent library, the next warning / error will " "give you more info.") + set(CAFFE2_USE_CUDA OFF) return() endif() -set(CAFFE2_FOUND_CUDA TRUE) message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION}) message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) @@ -22,10 +19,10 @@ if(CUDA_FOUND) # compiling with, e.g., if a ccache nvcc is fed to us by CUDA_NVCC_EXECUTABLE # but the PATH is not consistent with CUDA_HOME. It's better safe # than sorry: make sure everything is consistent. - set(file "${PROJECT_BINARY_DIR}/detect_cuda_version.c") + set(file "${PROJECT_BINARY_DIR}/detect_cuda_version.cc") file(WRITE ${file} "" "#include \n" - "#include \n" + "#include \n" "int main() {\n" " printf(\"%d.%d\", CUDA_VERSION / 1000, (CUDA_VERSION / 10) % 100);\n" " return 0;\n" @@ -94,13 +91,11 @@ find_package_handle_standard_args( if(NOT CUDNN_FOUND) message(WARNING "Caffe2: Cannot find cuDNN library. 
Turning the option off") - set(USE_CUDNN OFF) -else() - set(CAFFE2_FOUND_CUDNN TRUE) + set(CAFFE2_USE_CUDNN OFF) endif() # Optionally, find TensorRT -if (${USE_TENSORRT}) +if(CAFFE2_USE_TENSORRT) find_path(TENSORRT_INCLUDE_DIR NvInfer.h HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES include) @@ -112,12 +107,12 @@ if (${USE_TENSORRT}) if(NOT TENSORRT_FOUND) message(WARNING "Caffe2: Cannot find TensorRT library. Turning the option off") - set(USE_TENSORRT OFF) + set(CAFFE2_USE_TENSORRT OFF) endif() endif() -# ---[ Exract versions -if (CAFFE2_FOUND_CUDNN) +# ---[ Extract versions +if(CAFFE2_USE_CUDNN) # Get cuDNN version file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_HEADER_CONTENTS) string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" @@ -189,7 +184,7 @@ set_property( # cudnn # static linking is handled by USE_STATIC_CUDNN environment variable -if(${USE_CUDNN}) +if(CAFFE2_USE_CUDNN) add_library(caffe2::cudnn UNKNOWN IMPORTED) set_property( TARGET caffe2::cudnn PROPERTY IMPORTED_LOCATION @@ -231,7 +226,7 @@ set_property( ${CUDA_INCLUDE_DIRS}) # TensorRT -if(${USE_TENSORRT}) +if(CAFFE2_USE_TENSORRT) add_library(caffe2::tensorrt UNKNOWN IMPORTED) set_property( TARGET caffe2::tensorrt PROPERTY IMPORTED_LOCATION @@ -270,191 +265,6 @@ set_property( # now, Caffe2 only uses the above libraries, so we will only wrap # these. -# ---[ Cuda flags - -# Known NVIDIA GPU achitectures Caffe2 can be compiled for. -# Default is set to cuda 9. If we detect the cuda architectures to be less than -# 9, we will lower it to the corresponding known archs. -set(Caffe2_known_gpu_archs "30 35 50 52 60 61 70") # for CUDA 9.x -set(Caffe2_known_gpu_archs8 "30 35 50 52 60 61") # for CUDA 8.x -set(Caffe2_known_gpu_archs7 "30 35 50 52") # for CUDA 7.x - -################################################################################################ -# A function for automatic detection of GPUs installed (if autodetection is enabled) -# Usage: -# caffe2_detect_installed_gpus(out_variable) -function(caffe2_detect_installed_gpus out_variable) - if(NOT CUDA_gpu_detect_output) - set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${__cufile} "" - "#include \n" - "int main()\n" - "{\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" - " if (count == 0) return -1;\n" - " for (int device = 0; device < count; ++device)\n" - " {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " std::printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" ${CUDA_NVCC_FLAGS} "--run" "${__cufile}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(__nvcc_res EQUAL 0) - string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") - set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from caffe2_detect_installed_gpus tool" FORCE) - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(STATUS "Automatic GPU detection failed. 
Building for all known architectures.") - set(${out_variable} ${Caffe2_known_gpu_archs} PARENT_SCOPE) - else() - message(STATUS "Automatic GPU detection returned ${CUDA_gpu_detect_output}.") - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - - -################################################################################################ -# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME -# Usage: -# caffe_select_nvcc_arch_flags(out_variable) -function(caffe2_select_nvcc_arch_flags out_variable) - # List of arch names - set(__archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual") - set(__archs_name_default "All") - if(NOT CMAKE_CROSSCOMPILING) - list(APPEND __archs_names "Auto") - set(__archs_name_default "Auto") - endif() - - # Set CUDA_ARCH_NAME strings (so it will be seen as dropbox in the CMake GUI) - set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.") - set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names}) - mark_as_advanced(CUDA_ARCH_NAME) - - # Verify CUDA_ARCH_NAME value - if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};") - string(REPLACE ";" ", " __archs_names "${__archs_names}") - message(FATAL_ERROR "Invalid CUDA_ARCH_NAME, supported values: ${__archs_names}. Got ${CUDA_ARCH_NAME}.") - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Manual") - set(CUDA_ARCH_BIN "" CACHE STRING - "Specify GPU architectures to build binaries for (BIN(PTX) format is supported)") - set(CUDA_ARCH_PTX "" CACHE STRING - "Specify GPU architectures to build PTX intermediate code for") - mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) - else() - unset(CUDA_ARCH_BIN CACHE) - unset(CUDA_ARCH_PTX CACHE) - endif() - - set(CUDA_ARCH_LIST) - if(DEFINED ENV{TORCH_CUDA_ARCH_LIST}) - set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST}) - string(REGEX REPLACE "[ \t]+" ";" TORCH_CUDA_ARCH_LIST "${TORCH_CUDA_ARCH_LIST}") - list(APPEND CUDA_ARCH_LIST ${TORCH_CUDA_ARCH_LIST}) - message(STATUS "Set CUDA arch from TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}") - else() - list(APPEND CUDA_ARCH_LIST ${CUDA_ARCH_NAME}) - message(STATUS "Set CUDA arch from CUDA_ARCH_NAME: ${CUDA_ARCH_NAME}") - endif() - list(REMOVE_DUPLICATES CUDA_ARCH_LIST) - - set(__cuda_arch_bin) - set(__cuda_arch_ptx) - foreach(arch_name ${CUDA_ARCH_LIST}) - set(arch_bin) - set(arch_ptx) - set(add_ptx FALSE) - # Check to see if we are compiling PTX - if(arch_name MATCHES "(.*)\\+PTX$") - set(add_ptx TRUE) - set(arch_name ${CMAKE_MATCH_1}) - endif() - if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$") - set(arch_bin ${CMAKE_MATCH_1}) - set(arch_ptx ${arch_bin}) - else() - # Look for it in our list of known architectures - if(${arch_name} STREQUAL "Kepler") - set(arch_bin "30 35") - elseif(${arch_name} STREQUAL "Maxwell") - set(arch_bin "50") - elseif(${arch_name} STREQUAL "Pascal") - set(arch_bin "60 61") - elseif(${arch_name} STREQUAL "Volta") - set(arch_bin "70") - elseif(${arch_name} STREQUAL "All") - set(arch_bin ${Caffe2_known_gpu_archs}) - elseif(${arch_name} STREQUAL "Manual") - set(arch_bin ${CUDA_ARCH_BIN}) - set(arch_ptx ${CUDA_ARCH_PTX}) - set(add_ptx TRUE) - elseif(${arch_name} STREQUAL "Auto") - caffe2_detect_installed_gpus(arch_bin) - else() - message(FATAL_ERROR "Unknown CUDA architecture name ${arch_name}") - endif() - endif() - list(APPEND __cuda_arch_bin ${arch_bin}) - if(add_ptx) - if (NOT arch_ptx) - set(arch_ptx ${arch_bin}) - endif() - list(APPEND __cuda_arch_ptx ${arch_ptx}) - 
endif() - endforeach() - - # Remove dots and convert to lists - string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") - string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${__cuda_arch_ptx}") - string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") - list(REMOVE_DUPLICATES __cuda_arch_bin) - list(REMOVE_DUPLICATES __cuda_arch_ptx) - - set(__nvcc_flags "") - set(__nvcc_archs_readable "") - - # Tell NVCC to add binaries for the specified GPUs - foreach(__arch ${__cuda_arch_bin}) - if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") - # User explicitly specified PTX for the concrete BIN - list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) - else() - # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN - list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) - list(APPEND __nvcc_archs_readable sm_${__arch}) - endif() - endforeach() - - # Tell NVCC to add PTX intermediate code for the specified architectures - foreach(__arch ${__cuda_arch_ptx}) - list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) - list(APPEND __nvcc_archs_readable compute_${__arch}) - endforeach() - - string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") - set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) -endfunction() - -################################################################################################ -### Non macro section -################################################################################################ - # Special care for windows platform: we know that 32-bit windows does not # support cuda. if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") @@ -467,11 +277,9 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif() if (${CUDA_VERSION} LESS 8.0) # CUDA 7.x - set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x - set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") # CUDA 8 may complain that sm_20 is no longer supported. Suppress the @@ -510,9 +318,26 @@ elseif (CUDA_VERSION VERSION_EQUAL 8.0) endif() # setting nvcc arch flags -caffe2_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +if ((NOT EXISTS ${TORCH_CUDA_ARCH_LIST}) AND (DEFINED ENV{TORCH_CUDA_ARCH_LIST})) + message(WARNING + "In the future we will require one to explicitly pass " + "TORCH_CUDA_ARCH_LIST to cmake instead of implicitly setting it as an " + "env variable. This will become a FATAL_ERROR in future version of " + "pytorch.") + set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST}) +endif() +if (EXISTS ${CUDA_ARCH_NAME}) + message(WARNING + "CUDA_ARCH_NAME is no longer used. Use TORCH_CUDA_ARCH_LIST instead. " + "Right now, CUDA_ARCH_NAME is ${CUDA_ARCH_NAME} and " + "TORCH_CUDA_ARCH_LIST is ${TORCH_CUDA_ARCH_LIST}.") + set(TORCH_CUDA_ARCH_LIST TORCH_CUDA_ARCH_LIST ${CUDA_ARCH_NAME}) +endif() + +# Invoke cuda_select_nvcc_arch_flags from proper cmake FindCUDA. 
+cuda_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA ${TORCH_CUDA_ARCH_LIST}) list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) -message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA}") # disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc. foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used) diff --git a/conda/caffe2/normal/.gitignore b/conda/caffe2/normal/.gitignore new file mode 100644 index 00000000000000..0552636118f261 --- /dev/null +++ b/conda/caffe2/normal/.gitignore @@ -0,0 +1 @@ +meta.yaml diff --git a/conda/integrated/.gitignore b/conda/integrated/.gitignore new file mode 100644 index 00000000000000..0552636118f261 --- /dev/null +++ b/conda/integrated/.gitignore @@ -0,0 +1 @@ +meta.yaml diff --git a/docker/caffe2/ubuntu-16.04-cuda8-cudnn7-all-options/Dockerfile b/docker/caffe2/ubuntu-16.04-cuda8-cudnn7-all-options/Dockerfile index 194d7cbdf06387..9235da3102275f 100644 --- a/docker/caffe2/ubuntu-16.04-cuda8-cudnn7-all-options/Dockerfile +++ b/docker/caffe2/ubuntu-16.04-cuda8-cudnn7-all-options/Dockerfile @@ -29,7 +29,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ && rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ +RUN pip install --no-cache-dir --upgrade pip==9.0.3 setuptools wheel && \ pip install --no-cache-dir \ flask \ future \ @@ -50,8 +50,8 @@ RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ tornado ########## INSTALLATION STEPS ################### -RUN git clone --branch master --recursive https://github.com/caffe2/caffe2.git -RUN cd caffe2 && mkdir build && cd build \ +RUN git clone --branch master --recursive https://github.com/pytorch/pytorch.git +RUN cd pytorch && mkdir build && cd build \ && cmake .. \ -DCUDA_ARCH_NAME=Manual \ -DCUDA_ARCH_BIN="35 52 60 61" \ diff --git a/docs/source/autograd.rst b/docs/source/autograd.rst index e220aa930eda1f..058c0e6ba1dfa4 100644 --- a/docs/source/autograd.rst +++ b/docs/source/autograd.rst @@ -73,6 +73,15 @@ Tensor autograd functions .. autoclass:: Function :members: +.. _grad-check: + +Numerical gradient checking +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: gradcheck + +.. autofunction:: gradgradcheck + Profiler ^^^^^^^^ diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst index 55732ae0ca6231..810ef12bcc18a4 100644 --- a/docs/source/notes/extending.rst +++ b/docs/source/notes/extending.rst @@ -98,6 +98,12 @@ non-Variable arguments:: # Gradients of non-Tensor arguments to forward must be None. return grad_output * ctx.constant, None +.. note:: + Inputs to ``backward``, i.e., :attr:`grad_output`, can also be Tensors that + track history. So if ``backward`` is implemented with differentiable + operations, (e.g., invocation of another custom + :class:`~torch.autograd.function`), higher order derivatives will work. + You probably want to check if the backward method you implemented actually computes the derivatives of your function. It is possible by comparing with numerical approximations using small finite differences:: @@ -111,6 +117,8 @@ numerical approximations using small finite differences:: test = gradcheck(linear, input, eps=1e-6, atol=1e-4) print(test) +See :ref:`grad-check` for more details on finite-difference gradient comparisons. 
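A minimal sketch of the behaviour the note above describes (the ``Square`` function here is illustrative and not part of this patch): because ``backward`` is written entirely with differentiable tensor operations, ``gradcheck`` can verify the first derivatives and ``gradgradcheck`` the second derivatives::

    import torch
    from torch.autograd import Function, gradcheck, gradgradcheck

    class Square(Function):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)
            return x * x

        @staticmethod
        def backward(ctx, grad_output):
            x, = ctx.saved_tensors
            # grad_output may itself require grad; using only differentiable
            # ops here is what makes double backward work.
            return 2 * x * grad_output

    x = torch.randn(4, dtype=torch.float64, requires_grad=True)
    print(gradcheck(Square.apply, (x,), eps=1e-6, atol=1e-4))  # first derivatives
    print(gradgradcheck(Square.apply, (x,)))                   # second derivatives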
+ Extending :mod:`torch.nn` ------------------------- diff --git a/modules/observers/net_observer_reporter_print.cc b/modules/observers/net_observer_reporter_print.cc index 488fba3abfe1d1..51958dabdf5df4 100644 --- a/modules/observers/net_observer_reporter_print.cc +++ b/modules/observers/net_observer_reporter_print.cc @@ -1,5 +1,6 @@ #include "observers/net_observer_reporter_print.h" +#include <sstream> #include "caffe2/core/init.h" #include "observers/observer_config.h" @@ -10,11 +11,44 @@ const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer "; void NetObserverReporterPrint::report( NetBase* net, std::map& info) { - LOG(INFO) << IDENTIFIER << "Net Name - " << net->Name(); - LOG(INFO) << IDENTIFIER << "Delay Start"; + std::map<std::string, std::map<std::string, std::string>> caffe2_perf; + std::map<std::string, std::string> op_perf; + std::map<std::string, std::string> net_perf; + for (auto& p : info) { - LOG(INFO) << IDENTIFIER << p.first << " - " << p.second.latency << "\t(ms)"; + if ((p.first == "NET_DELAY") && (info.size() == 1)) { + // for Net_delay perf + net_perf["latency"] = caffe2::to_string(p.second.latency); + net_perf["flops"] = caffe2::to_string(-1); + caffe2_perf["NET"] = net_perf; + } else if (p.first != "NET_DELAY") { + // for operator perf + op_perf["latency"] = caffe2::to_string(p.second.latency); + op_perf["flops"] = caffe2::to_string(p.second.flops); + caffe2_perf[p.first] = op_perf; + } + } + + std::stringstream buffer; + buffer << IDENTIFIER << "{"; + for (auto it = caffe2_perf.begin(); it != caffe2_perf.end(); it++) { + buffer << "\"" << it->first << "\"" + << ": {"; + for (auto jt = it->second.begin(); jt != it->second.end(); jt++) { + buffer << "\"" << jt->first << "\"" + << ": " << jt->second; + auto kt = jt; + if ((++kt) != it->second.end()) { + buffer << ", "; + } + } + buffer << "}"; + auto lt = it; + if ((++lt) != caffe2_perf.end()) { + buffer << ", "; + } } - LOG(INFO) << IDENTIFIER << "Delay End"; + buffer << "}"; + LOG(INFO) << buffer.str(); } } diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc index 9320625f147ac3..f5ab3f11af1c6f 100644 --- a/modules/observers/perf_observer.cc +++ b/modules/observers/perf_observer.cc @@ -85,6 +85,11 @@ void PerfNetObserver::Stop() { p.latency = static_cast(observerMap_[op]) ->getMilliseconds(); +#ifndef CAFFE2_IOS + auto cost = static_cast(observerMap_[op]) + ->getAnalyticalCost(); + p.flops = cost.flops; +#endif // CAFFE2_MOBILE p.engine = op->engine(); p.type = op->type(); diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 4497a330facd5a..ce65c7914d7b11 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -324,7 +324,7 @@ if [[ -n $pytorch_too ]]; then fi if [[ -z $slim ]]; then - add_package 'opencv' + add_package 'opencv' '<3.4' else caffe2_cmake_args+=("-DUSE_OPENCV=OFF") fi @@ -355,13 +355,8 @@ else fi # Change flags based on target gcc ABI +# Default conda channels use gcc 7.2, conda-forge uses gcc 4.8.5 if [[ "$(uname)" != 'Darwin' && "$GCC_USE_C11" -eq 0 ]]; then - # opencv 3.3.1 in conda-forge doesn't have imgcodecs, and opencv 3.1.0 - # requires numpy 1.12 - remove_lines_with 'opencv' - add_package 'opencv' '==3.1.0' - - # Default conda channels use gcc 7.2, conda-forge uses gcc 4.8.5 caffe2_cmake_args+=("-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0") conda_channel+=('-c conda-forge') fi diff --git a/scripts/build_windows.bat b/scripts/build_windows.bat index b3432d37ef495e..185d43ac0a8992 100644 --- a/scripts/build_windows.bat +++ b/scripts/build_windows.bat @@ -51,7 +51,7 @@ cmake .. 
^ -DBUILD_TEST=OFF ^ -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ -DUSE_CUDA=%USE_CUDA% ^ - -DCUDA_ARCH_NAME=Maxwell ^ + -DTORCH_CUDA_ARCH_LIST=5.0 ^ -DUSE_NNPACK=OFF ^ -DUSE_CUB=OFF ^ -DUSE_GLOG=OFF ^ diff --git a/setup.py b/setup.py index 8dfeca7424d48b..50c28867b296fd 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,9 @@ # NO_CUDNN # disables the cuDNN build # +# NO_MIOPEN +# disables the MIOpen build +# # NO_MKLDNN # disables the MKLDNN build # @@ -91,6 +94,7 @@ import distutils.unixccompiler import distutils.command.build import distutils.command.clean +import distutils.sysconfig import platform import subprocess import shutil @@ -103,8 +107,11 @@ from tools.setup_helpers.env import check_env_flag from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME, CUDA_VERSION +from tools.setup_helpers.rocm import WITH_ROCM, ROCM_HOME, ROCM_VERSION from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR) +from tools.setup_helpers.miopen import (WITH_MIOPEN, MIOPEN_LIBRARY, + MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR) from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \ NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB from tools.setup_helpers.mkldnn import (WITH_MKLDNN, MKLDNN_LIBRARY, @@ -116,31 +123,58 @@ from tools.setup_helpers.dist_check import WITH_DISTRIBUTED, \ WITH_DISTRIBUTED_MW, WITH_GLOO_IBVERBS -DEBUG = check_env_flag('DEBUG') +################################################################################ +# Parameters parsed from environment +################################################################################ + +DEBUG = check_env_flag('DEBUG') IS_WINDOWS = (platform.system() == 'Windows') IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') - -# Check if ROCM is enabled -WITH_ROCM = check_env_flag('WITH_ROCM') +FULL_CAFFE2 = check_env_flag('FULL_CAFFE2') +BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH') NUM_JOBS = multiprocessing.cpu_count() max_jobs = os.getenv("MAX_JOBS") if max_jobs is not None: NUM_JOBS = min(NUM_JOBS, int(max_jobs)) +# Ninja try: import ninja WITH_NINJA = True + ninja_global = NinjaBuilder('global') except ImportError: WITH_NINJA = False + ninja_global = None -if not WITH_NINJA: - ################################################################################ - # Monkey-patch setuptools to compile in parallel - ################################################################################ +# Constant known variables used throughout this file +cwd = os.path.dirname(os.path.abspath(__file__)) +lib_path = os.path.join(cwd, "torch", "lib") +third_party_path = os.path.join(cwd, "third_party") +tmp_install_path = lib_path + "/tmp_install" + + +class PytorchCommand(setuptools.Command): + """ + Base Pytorch command to avoid implementing initialize/finalize_options in + every subclass + """ + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + +################################################################################ +# Patches and workarounds +################################################################################ +# Monkey-patch setuptools to compile in parallel +if not WITH_NINJA: def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None): @@ -160,6 +194,7 @@ def _single_compile(obj): return objects distutils.ccompiler.CCompiler.compile = parallelCCompile +# Patch for linking with ccache original_link = 
distutils.unixccompiler.UnixCCompiler.link @@ -170,35 +205,72 @@ def patched_link(self, *args, **kwargs): self.compiler_cxx = _cxx return result - distutils.unixccompiler.UnixCCompiler.link = patched_link -################################################################################ # Workaround setuptools -Wstrict-prototypes warnings # I lifted this code from https://stackoverflow.com/a/29634231/23845 -################################################################################ -import distutils.sysconfig cfg_vars = distutils.sysconfig.get_config_vars() for key, value in cfg_vars.items(): if type(value) == str: cfg_vars[key] = value.replace("-Wstrict-prototypes", "") + +################################################################################ +# Version and create_version_file +################################################################################ +version = '0.5.0a0' +if os.getenv('PYTORCH_BUILD_VERSION'): + assert os.getenv('PYTORCH_BUILD_NUMBER') is not None + build_number = int(os.getenv('PYTORCH_BUILD_NUMBER')) + version = os.getenv('PYTORCH_BUILD_VERSION') + if build_number > 1: + version += '.post' + str(build_number) +else: + try: + sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() + version += '+' + sha[:7] + except Exception: + pass + + +class create_version_file(PytorchCommand): + def run(self): + global version, cwd + print('-- Building version ' + version) + version_path = os.path.join(cwd, 'torch', 'version.py') + with open(version_path, 'w') as f: + f.write("__version__ = '{}'\n".format(version)) + # NB: This is not 100% accurate, because you could have built the + # library code with DEBUG, but csrc without DEBUG (in which case + # this would claim to be a release build when it's not.) + f.write("debug = {}\n".format(repr(DEBUG))) + f.write("cuda = {}\n".format(repr(CUDA_VERSION))) + + ################################################################################ -# Custom build commands +# Building dependent libraries ################################################################################ +# All libraries that torch could depend on dep_libs = [ 'nccl', 'caffe2', 'libshm', 'libshm_windows', 'gloo', 'THD', 'nanopb', ] +missing_pydep = ''' +Missing build dependency: Unable to `import {importname}`. 
+Please install it via `conda install {module}` or `pip install {module}` +'''.strip() + -# global ninja file for building generated code stuff -ninja_global = None -if WITH_NINJA: - ninja_global = NinjaBuilder('global') +def check_pydep(importname, module): + try: + importlib.import_module(importname) + except ImportError: + raise RuntimeError(missing_pydep.format(importname=importname, module=module)) +# Calls build_pytorch_libs.sh/bat with the correct env variables def build_libs(libs): for lib in libs: assert lib in dep_libs, 'invalid lib: {}'.format(lib) @@ -229,44 +301,30 @@ def build_libs(libs): my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR + if WITH_MIOPEN: + my_env["MIOPEN_LIB_DIR"] = MIOPEN_LIB_DIR + my_env["MIOPEN_LIBRARY"] = MIOPEN_LIBRARY + my_env["MIOPEN_INCLUDE_DIR"] = MIOPEN_INCLUDE_DIR if WITH_MKLDNN: my_env["MKLDNN_LIB_DIR"] = MKLDNN_LIB_DIR my_env["MKLDNN_LIBRARY"] = MKLDNN_LIBRARY my_env["MKLDNN_INCLUDE_DIR"] = MKLDNN_INCLUDE_DIR build_libs_cmd += ['--with-mkldnn'] - if WITH_GLOO_IBVERBS: build_libs_cmd += ['--with-gloo-ibverbs'] - if WITH_DISTRIBUTED_MW: build_libs_cmd += ['--with-distributed-mw'] + if FULL_CAFFE2: + build_libs_cmd += ['--full-caffe2'] + if subprocess.call(build_libs_cmd + libs, env=my_env) != 0: print("Failed to run '{}'".format(' '.join(build_libs_cmd + libs))) sys.exit(1) -missing_pydep = ''' -Missing build dependency: Unable to `import {importname}`. -Please install it via `conda install {module}` or `pip install {module}` -'''.strip() - - -def check_pydep(importname, module): - try: - importlib.import_module(importname) - except ImportError: - raise RuntimeError(missing_pydep.format(importname=importname, module=module)) - - -class build_deps(Command): - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass +# Build all dependent libraries +class build_deps(PytorchCommand): def run(self): # Check if you remembered to check out submodules def check_file(f): @@ -334,15 +392,7 @@ def run(self): build_dep_cmds['build_' + lib.lower()] = build_dep -class build_module(Command): - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - +class build_module(PytorchCommand): def run(self): self.run_command('build_py') self.run_command('build_ext') @@ -351,27 +401,14 @@ def run(self): class build_py(setuptools.command.build_py.build_py): def run(self): - self.create_version_file() + self.run_command('create_version_file') setuptools.command.build_py.build_py.run(self) - @staticmethod - def create_version_file(): - global version, cwd - print('-- Building version ' + version) - version_path = os.path.join(cwd, 'torch', 'version.py') - with open(version_path, 'w') as f: - f.write("__version__ = '{}'\n".format(version)) - # NB: This is not 100% accurate, because you could have built the - # library code with DEBUG, but csrc without DEBUG (in which case - # this would claim to be a release build when it's not.) 
- f.write("debug = {}\n".format(repr(DEBUG))) - f.write("cuda = {}\n".format(repr(CUDA_VERSION))) - class develop(setuptools.command.develop.develop): def run(self): - build_py.create_version_file() + self.run_command('create_version_file') setuptools.command.develop.develop.run(self) self.create_compile_commands() @@ -426,6 +463,8 @@ def run(self): print('-- Detected cuDNN at ' + CUDNN_LIBRARY + ', ' + CUDNN_INCLUDE_DIR) else: print('-- Not using cuDNN') + if WITH_MIOPEN: + print('-- Detected MIOpen at ' + MIOPEN_LIBRARY + ', ' + MIOPEN_INCLUDE_DIR) if WITH_CUDA: print('-- Detected CUDA at ' + CUDA_HOME) else: @@ -474,6 +513,39 @@ def run(self): self.copy_file(export_lib, target_lib) + def build_extensions(self): + # If also building Caffe2 python, then we need this extra logic copied + # from the old setup_caffe2.py + if FULL_CAFFE2: + pybind_exts = [ + 'caffe2.python.caffe2_pybind11_state', + 'caffe2.python.caffe2_pybind11_state_gpu', + ] + # The cmake of Caffe2 puts these in the site-packages rather than + # the build directory like the other torch extensions + sp_dir = distutils.sysconfig.get_python_lib(prefix='') + i = 0 + while i < len(self.extensions): + ext = self.extensions[i] + if ext.name not in pybind_exts: + i += 1 + continue + fullname = self.get_ext_fullname(ext.name) + filename = self.get_ext_filename(fullname) + + src = os.path.join(tmp_install_path, sp_dir, filename) + if not os.path.exists(src): + print("{} does not exist".format(src)) + del self.extensions[i] + else: + dst = os.path.join(os.path.realpath(self.build_lib), filename) + dst_dir = os.path.dirname(dst) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir) + self.copy_file(src, dst) + i += 1 + distutils.command.build_ext.build_ext.build_extensions(self) + class build(distutils.command.build.build): sub_commands = [ @@ -553,12 +625,6 @@ def run(self): if check_env_flag('WERROR'): extra_compile_args.append('-Werror') -cwd = os.path.dirname(os.path.abspath(__file__)) -lib_path = os.path.join(cwd, "torch", "lib") -third_party_path = os.path.join(cwd, "third_party") - - -tmp_install_path = lib_path + "/tmp_install" include_dirs += [ cwd, os.path.join(cwd, "torch", "csrc"), @@ -573,8 +639,10 @@ def run(self): # we specify exact lib names to avoid conflict with lua-torch installs CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.so')] -if WITH_CUDA or WITH_ROCM: +if WITH_CUDA: CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_gpu.so'), '-Wl,--as-needed']) +if WITH_ROCM: + CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_hip.so'), '-Wl,--as-needed']) THD_LIB = os.path.join(lib_path, 'libTHD.a') NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1') @@ -583,14 +651,18 @@ def run(self): if IS_DARWIN: CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.dylib')] - if WITH_CUDA or WITH_ROCM: + if WITH_CUDA: CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_gpu.dylib')) + if WITH_ROCM: + CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_hip.dylib')) NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib') if IS_WINDOWS: CAFFE2_LIBS = [os.path.join(lib_path, 'caffe2.lib')] - if WITH_CUDA or WITH_ROCM: + if WITH_CUDA: CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib')) + if WITH_ROCM: + CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib')) if DEBUG: NANOPB_STATIC_LIB = os.path.join(lib_path, 'protobuf-nanopbd.lib') else: @@ -664,6 +736,7 @@ def run(self): "torch/csrc/jit/passes/canonicalize.cpp", "torch/csrc/jit/passes/batch_mm.cpp", 
"torch/csrc/jit/passes/decompose_addmm.cpp", + "torch/csrc/jit/passes/loop_unrolling.cpp", "torch/csrc/jit/passes/onnx/peephole.cpp", "torch/csrc/jit/passes/onnx/fixup_onnx_loop.cpp", "torch/csrc/jit/generated/aten_dispatch.cpp", @@ -789,7 +862,6 @@ def run(self): extra_link_args.append('-Wl,-rpath,' + hip_lib_path) extra_compile_args += ['-DWITH_ROCM'] extra_compile_args += ['-D__HIP_PLATFORM_HCC__'] - main_sources += [ "torch/csrc/cuda/Module.cpp", "torch/csrc/cuda/Storage.cpp", @@ -819,6 +891,12 @@ def run(self): if not IS_WINDOWS: extra_link_args.insert(0, '-Wl,-rpath,' + CUDNN_LIB_DIR) extra_compile_args += ['-DWITH_CUDNN'] +if WITH_MIOPEN: + # main_libraries += [MIOPEN_LIBRARY] + include_dirs.insert(0, MIOPEN_INCLUDE_DIR) + if not IS_WINDOWS: + extra_link_args.insert(0, '-Wl,-rpath,' + MIOPEN_LIB_DIR) + extra_compile_args += ['-DWITH_MIOPEN'] if DEBUG: if IS_WINDOWS: @@ -884,21 +962,22 @@ def make_relative_rpath(path): ) extensions.append(THNVRTC) -version = '0.5.0a0' -if os.getenv('PYTORCH_BUILD_VERSION'): - assert os.getenv('PYTORCH_BUILD_NUMBER') is not None - build_number = int(os.getenv('PYTORCH_BUILD_NUMBER')) - version = os.getenv('PYTORCH_BUILD_VERSION') - if build_number > 1: - version += '.post' + str(build_number) -else: - try: - sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() - version += '+' + sha[:7] - except Exception: - pass +if FULL_CAFFE2: + # If building Caffe2 python as well, these extensions are built by cmake + # copied manually in build_extensions() inside the build_ext implementaiton + extensions.append( + setuptools.Extension( + name=str('caffe2.python.caffe2_pybind11_state'), + sources=[]), + ) + extensions.append( + setuptools.Extension( + name=str('caffe2.python.caffe2_pybind11_state_gpu'), + sources=[]), + ) cmdclass = { + 'create_version_file': create_version_file, 'build': build, 'build_py': build_py, 'build_ext': build_ext, diff --git a/setup_caffe2.py b/setup_caffe2.py index d5a0e1db20f0cc..77a7b64f4a3302 100644 --- a/setup_caffe2.py +++ b/setup_caffe2.py @@ -261,4 +261,10 @@ def run(self): author='jiayq', author_email='jiayq@fb.com', url='https://caffe2.ai', + entry_points={ + 'console_scripts': [ + 'convert-caffe2-to-onnx = caffe2.python.onnx.bin.conversion:caffe2_to_onnx', + 'convert-onnx-to-caffe2 = caffe2.python.onnx.bin.conversion:onnx_to_caffe2', + ] + }, ) diff --git a/test/common.py b/test/common.py index 52033b3a8858bb..94da44ea1b664c 100644 --- a/test/common.py +++ b/test/common.py @@ -187,7 +187,8 @@ def __init__(self, method_name='runTest'): # Wraps the tested method if we should do CUDA memory check. test_method = getattr(self, method_name) self._do_cuda_memory_leak_check &= getattr(test_method, '_do_cuda_memory_leak_check', True) - if self._do_cuda_memory_leak_check: + # FIXME: figure out the flaky -1024 anti-leaks on windows. See #8044 + if self._do_cuda_memory_leak_check and not IS_WINDOWS: # the import below may initialize CUDA context, so we do it only if # self._do_cuda_memory_leak_check is True. 
from common_cuda import TEST_CUDA diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index f8f43da1c8ddad..0f4fa33ddf3e1c 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -12,6 +13,9 @@ using namespace torch; using namespace torch::nn; +template +using OrderedDict = detail::OrderedDict; + using Catch::StartsWith; TEST_CASE("misc") { @@ -67,7 +71,7 @@ TEST_CASE("autograd") { } SECTION("custom gradient inputs") { z.sum().backward( - autograd::make_variable(at::ones(at::CPU(at::kFloat), {1}) * 2)); + autograd::make_variable(at::ones(at::CPU(at::kFloat), {}) * 2)); REQUIRE(x.grad().allclose(y * 2)); } // Assume everything else is safe from PyTorch tests. @@ -153,3 +157,193 @@ TEST_CASE("make_unique") { REQUIRE(ptr[2] == 0); } } + +TEST_CASE("ordered-dict") { + SECTION("is empty after default construction") { + OrderedDict dict; + REQUIRE(dict.subject() == "Key"); + REQUIRE(dict.is_empty()); + REQUIRE(dict.size() == 0); + } + + SECTION("insert inserts elements when they are not yet present") { + OrderedDict dict; + dict.insert("a", 1); + dict.insert("b", 2); + REQUIRE(dict.size() == 2); + } + + SECTION("get returns values when present") { + OrderedDict dict; + dict.insert("a", 1); + dict.insert("b", 2); + REQUIRE(dict.get("a") == 1); + REQUIRE(dict.get("b") == 2); + } + + SECTION("get throws when passed keys that are not present") { + OrderedDict dict; + dict.insert("a", 1); + dict.insert("b", 2); + REQUIRE_THROWS_WITH( + dict.get("foo"), StartsWith("Key 'foo' is not defined")); + REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); + } + + SECTION("can initialize from list") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.size() == 2); + REQUIRE(dict.get("a") == 1); + REQUIRE(dict.get("b") == 2); + } + + SECTION("insert throws when passed elements that are present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE_THROWS_WITH( + dict.insert("a", 1), StartsWith("Key 'a' already defined")); + REQUIRE_THROWS_WITH( + dict.insert("b", 1), StartsWith("Key 'b' already defined")); + } + + SECTION("front() returns the first item") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.front().key == "a"); + REQUIRE(dict.front().value == 1); + } + + SECTION("back() returns the last item") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.back().key == "b"); + REQUIRE(dict.back().value == 2); + } + + SECTION("find returns pointers to values when present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.find("a") != nullptr); + REQUIRE(*dict.find("a") == 1); + REQUIRE(dict.find("b") != nullptr); + REQUIRE(*dict.find("b") == 2); + } + + SECTION("find returns null pointers when passed keys that are not present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.find("bar") == nullptr); + REQUIRE(dict.find("") == nullptr); + } + + SECTION("operator[] returns values when passed keys that are present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict["a"] == 1); + REQUIRE(dict["b"] == 2); + } + + SECTION("operator[] returns items positionally when passed integers") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict[0].key == "a"); + REQUIRE(dict[0].value == 1); + REQUIRE(dict[1].key == "b"); + REQUIRE(dict[1].value == 2); + } + + SECTION("operator[] throws when passed keys that are not present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE_THROWS_WITH( + dict.get("foo"), StartsWith("Key 'foo' is not defined")); + 
REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); + } + + SECTION("update inserts all items from another OrderedDict") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict dict2 = {{"c", 3}}; + dict2.update(dict); + REQUIRE(dict2.size() == 3); + REQUIRE(dict2.find("a") != nullptr); + REQUIRE(dict2.find("b") != nullptr); + REQUIRE(dict2.find("c") != nullptr); + } + + SECTION("update also checks for duplicates") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict dict2 = {{"a", 1}}; + REQUIRE_THROWS_WITH( + dict2.update(dict), StartsWith("Key 'a' already defined")); + } + + SECTION("Can iterate items") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + auto iterator = dict.begin(); + REQUIRE(iterator != dict.end()); + REQUIRE(iterator->key == "a"); + REQUIRE(iterator->value == 1); + ++iterator; + REQUIRE(iterator != dict.end()); + REQUIRE(iterator->key == "b"); + REQUIRE(iterator->value == 2); + ++iterator; + REQUIRE(iterator == dict.end()); + } + + SECTION("clear makes the dict empty") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(!dict.is_empty()); + dict.clear(); + REQUIRE(dict.is_empty()); + } + + SECTION("can copy construct") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict copy = dict; + REQUIRE(copy.size() == 2); + REQUIRE(*copy[0] == 1); + REQUIRE(*copy[1] == 2); + } + + SECTION("can copy assign") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict copy = {{"c", 1}}; + REQUIRE(copy.find("c") != nullptr); + copy = dict; + REQUIRE(copy.size() == 2); + REQUIRE(*copy[0] == 1); + REQUIRE(*copy[1] == 2); + REQUIRE(copy.find("c") == nullptr); + } + + SECTION("can move construct") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict copy = std::move(dict); + REQUIRE(copy.size() == 2); + REQUIRE(*copy[0] == 1); + REQUIRE(*copy[1] == 2); + } + + SECTION("can move assign") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict copy = {{"c", 1}}; + REQUIRE(copy.find("c") != nullptr); + copy = std::move(dict); + REQUIRE(copy.size() == 2); + REQUIRE(*copy[0] == 1); + REQUIRE(*copy[1] == 2); + REQUIRE(copy.find("c") == nullptr); + } + + SECTION("can insert with braces") { + OrderedDict> dict; + dict.insert("a", {1, 2}); + REQUIRE(!dict.is_empty()); + REQUIRE(dict["a"].first == 1); + REQUIRE(dict["a"].second == 2); + } + + SECTION("Error messages include the what") { + OrderedDict dict("Penguin"); + REQUIRE(dict.subject() == "Penguin"); + dict.insert("a", 1); + REQUIRE(!dict.is_empty()); + REQUIRE_THROWS_WITH( + dict.get("b"), StartsWith("Penguin 'b' is not defined")); + REQUIRE_THROWS_WITH( + dict.insert("a", 1), StartsWith("Penguin 'a' already defined")); + } +} diff --git a/test/expect/TestScript.test_cat_lifts.expect b/test/expect/TestScript.test_cat_lifts.expect new file mode 100644 index 00000000000000..5bcef43f7c7a3d --- /dev/null +++ b/test/expect/TestScript.test_cat_lifts.expect @@ -0,0 +1,12 @@ +graph(%x : Dynamic) { + %1 : Dynamic = aten::cat[dim=1](%x, %x) + return (%1); +} +graph(%x : Dynamic) { + %1 : Dynamic = aten::cat[dim=1]() + return (%1); +} +graph(%x : Dynamic) { + %1 : Dynamic = aten::cat[dim=1](%x) + return (%1); +} diff --git a/test/expect/TestScript.test_loop_unroll_unused_counter.expect b/test/expect/TestScript.test_loop_unroll_unused_counter.expect new file mode 100644 index 00000000000000..6db04ad55ec313 --- /dev/null +++ b/test/expect/TestScript.test_loop_unroll_unused_counter.expect @@ -0,0 +1,36 @@ +graph(%x : Dynamic) { + %y.1 : Long() = prim::Constant[value={0}]() + %2 : Byte() = 
prim::Constant[value={1}]() + %3 : Dynamic = aten::div[other={8}](%x) + %4 : Dynamic = aten::mul[other={8}](%3) + %5 : Dynamic = aten::sub[alpha={1}](%x, %4) + %y.3 : Long() = prim::Loop(%3, %2, %y.1) + block0(%i.1 : Dynamic, %8 : Long()) { + %9 : Long() = prim::Constant[value={1}]() + %y.12 : Dynamic = aten::add[alpha={1}](%8, %9) + %11 : Long() = prim::Constant[value={1}]() + %y.5 : Dynamic = aten::add[alpha={1}](%y.12, %11) + %13 : Long() = prim::Constant[value={1}]() + %y.6 : Dynamic = aten::add[alpha={1}](%y.5, %13) + %15 : Long() = prim::Constant[value={1}]() + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, %15) + %17 : Long() = prim::Constant[value={1}]() + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %17) + %19 : Long() = prim::Constant[value={1}]() + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %19) + %21 : Long() = prim::Constant[value={1}]() + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %21) + %23 : Long() = prim::Constant[value={1}]() + %y.11 : Dynamic = aten::add[alpha={1}](%y.10, %23) + %25 : Byte() = prim::Constant[value={1}]() + -> (%25, %y.11) + } + %y : Long() = prim::Loop(%5, %2, %y.3) + block0(%i : Dynamic, %28 : Long()) { + %29 : Long() = prim::Constant[value={1}]() + %y.4 : Dynamic = aten::add[alpha={1}](%28, %29) + %31 : Byte() = prim::Constant[value={1}]() + -> (%31, %y.4) + } + return (%y); +} diff --git a/test/expect/TestScript.test_loop_unrolling.expect b/test/expect/TestScript.test_loop_unrolling.expect new file mode 100644 index 00000000000000..0c61346c786fbc --- /dev/null +++ b/test/expect/TestScript.test_loop_unrolling.expect @@ -0,0 +1,37 @@ +graph(%x : Dynamic) { + %y.1 : Long() = prim::Constant[value={0}]() + %2 : Byte() = prim::Constant[value={1}]() + %3 : Long() = prim::Constant[value={0}]() + %4 : Dynamic = aten::div[other={8}](%x) + %5 : Dynamic = aten::mul[other={8}](%4) + %6 : Dynamic = aten::sub[alpha={1}](%x, %5) + %7 : Dynamic, %y.3 : Long() = prim::Loop(%4, %2, %3, %y.1) + block0(%i.1 : Dynamic, %10 : Dynamic, %11 : Long()) { + %y.12 : Dynamic = aten::add[alpha={1}](%11, %10) + %13 : Dynamic = aten::add[alpha={1}, other={1}](%10) + %y.5 : Dynamic = aten::add[alpha={1}](%y.12, %13) + %15 : Dynamic = aten::add[alpha={1}, other={1}](%13) + %y.6 : Dynamic = aten::add[alpha={1}](%y.5, %15) + %17 : Dynamic = aten::add[alpha={1}, other={1}](%15) + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, %17) + %19 : Dynamic = aten::add[alpha={1}, other={1}](%17) + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %19) + %21 : Dynamic = aten::add[alpha={1}, other={1}](%19) + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %21) + %23 : Dynamic = aten::add[alpha={1}, other={1}](%21) + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %23) + %25 : Dynamic = aten::add[alpha={1}, other={1}](%23) + %y.11 : Dynamic = aten::add[alpha={1}](%y.10, %25) + %27 : Byte() = prim::Constant[value={1}]() + %28 : Dynamic = aten::add[alpha={1}, other={1}](%25) + -> (%27, %28, %y.11) + } + %29 : Dynamic, %y : Long() = prim::Loop(%6, %2, %7, %y.3) + block0(%i : Dynamic, %32 : Dynamic, %33 : Long()) { + %y.4 : Dynamic = aten::add[alpha={1}](%33, %32) + %35 : Byte() = prim::Constant[value={1}]() + %36 : Dynamic = aten::add[alpha={1}, other={1}](%32) + -> (%35, %36, %y.4) + } + return (%y); +} diff --git a/test/expect/TestScript.test_loop_unrolling_const-add_const.expect b/test/expect/TestScript.test_loop_unrolling_const-add_const.expect new file mode 100644 index 00000000000000..28eb86abd3f0bd --- /dev/null +++ b/test/expect/TestScript.test_loop_unrolling_const-add_const.expect @@ -0,0 +1,24 @@ +graph() { + 
%y.1 : Long() = prim::Constant[value={0}]() + %1 : Long() = prim::Constant[value={1}]() + %y.11 : Dynamic = aten::add[alpha={1}](%y.1, %1) + %3 : Long() = prim::Constant[value={1}]() + %y.2 : Dynamic = aten::add[alpha={1}](%y.11, %3) + %5 : Long() = prim::Constant[value={1}]() + %y.3 : Dynamic = aten::add[alpha={1}](%y.2, %5) + %7 : Long() = prim::Constant[value={1}]() + %y.4 : Dynamic = aten::add[alpha={1}](%y.3, %7) + %9 : Long() = prim::Constant[value={1}]() + %y.5 : Dynamic = aten::add[alpha={1}](%y.4, %9) + %11 : Long() = prim::Constant[value={1}]() + %y.6 : Dynamic = aten::add[alpha={1}](%y.5, %11) + %13 : Long() = prim::Constant[value={1}]() + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, %13) + %15 : Long() = prim::Constant[value={1}]() + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %15) + %17 : Long() = prim::Constant[value={1}]() + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %17) + %19 : Long() = prim::Constant[value={1}]() + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %19) + return (%y.10); +} diff --git a/test/expect/TestScript.test_loop_unrolling_const-add_iter.expect b/test/expect/TestScript.test_loop_unrolling_const-add_iter.expect new file mode 100644 index 00000000000000..99e0996ffcf995 --- /dev/null +++ b/test/expect/TestScript.test_loop_unrolling_const-add_iter.expect @@ -0,0 +1,24 @@ +graph() { + %y.1 : Long() = prim::Constant[value={0}]() + %1 : Long() = prim::Constant[value={0}]() + %y.11 : Dynamic = aten::add[alpha={1}](%y.1, %1) + %3 : Dynamic = aten::add[alpha={1}, other={1}](%1) + %y.2 : Dynamic = aten::add[alpha={1}](%y.11, %3) + %5 : Dynamic = aten::add[alpha={1}, other={1}](%3) + %y.3 : Dynamic = aten::add[alpha={1}](%y.2, %5) + %7 : Dynamic = aten::add[alpha={1}, other={1}](%5) + %y.4 : Dynamic = aten::add[alpha={1}](%y.3, %7) + %9 : Dynamic = aten::add[alpha={1}, other={1}](%7) + %y.5 : Dynamic = aten::add[alpha={1}](%y.4, %9) + %11 : Dynamic = aten::add[alpha={1}, other={1}](%9) + %y.6 : Dynamic = aten::add[alpha={1}](%y.5, %11) + %13 : Dynamic = aten::add[alpha={1}, other={1}](%11) + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, %13) + %15 : Dynamic = aten::add[alpha={1}, other={1}](%13) + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %15) + %17 : Dynamic = aten::add[alpha={1}, other={1}](%15) + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %17) + %19 : Dynamic = aten::add[alpha={1}, other={1}](%17) + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %19) + return (%y.10); +} diff --git a/test/expect/TestScript.test_loop_unrolling_nested.expect b/test/expect/TestScript.test_loop_unrolling_nested.expect new file mode 100644 index 00000000000000..97eb4245c4f86d --- /dev/null +++ b/test/expect/TestScript.test_loop_unrolling_nested.expect @@ -0,0 +1,44 @@ +graph(%x : Dynamic) { + %y.1 : Long() = prim::Constant[value={0}]() + %2 : Long() = prim::Constant[value={10}]() + %3 : Byte() = prim::Constant[value={1}]() + %y : Long() = prim::Loop(%2, %3, %y.1) + block0(%i : Dynamic, %6 : Long()) { + %7 : Byte() = prim::Constant[value={1}]() + %8 : Long() = prim::Constant[value={0}]() + %9 : Dynamic = aten::div[other={8}](%x) + %10 : Dynamic = aten::mul[other={8}](%9) + %11 : Dynamic = aten::sub[alpha={1}](%x, %10) + %12 : Dynamic, %y.4 : Long() = prim::Loop(%9, %7, %8, %6) + block0(%j.1 : Dynamic, %15 : Dynamic, %16 : Long()) { + %y.13 : Dynamic = aten::add[alpha={1}](%16, %15) + %18 : Dynamic = aten::add[alpha={1}, other={1}](%15) + %y.6 : Dynamic = aten::add[alpha={1}](%y.13, %18) + %20 : Dynamic = aten::add[alpha={1}, other={1}](%18) + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, 
%20) + %22 : Dynamic = aten::add[alpha={1}, other={1}](%20) + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %22) + %24 : Dynamic = aten::add[alpha={1}, other={1}](%22) + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %24) + %26 : Dynamic = aten::add[alpha={1}, other={1}](%24) + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %26) + %28 : Dynamic = aten::add[alpha={1}, other={1}](%26) + %y.11 : Dynamic = aten::add[alpha={1}](%y.10, %28) + %30 : Dynamic = aten::add[alpha={1}, other={1}](%28) + %y.12 : Dynamic = aten::add[alpha={1}](%y.11, %30) + %32 : Byte() = prim::Constant[value={1}]() + %33 : Dynamic = aten::add[alpha={1}, other={1}](%30) + -> (%32, %33, %y.12) + } + %34 : Dynamic, %y.3 : Long() = prim::Loop(%11, %7, %12, %y.4) + block0(%j : Dynamic, %37 : Dynamic, %38 : Long()) { + %y.5 : Dynamic = aten::add[alpha={1}](%38, %37) + %40 : Byte() = prim::Constant[value={1}]() + %41 : Dynamic = aten::add[alpha={1}, other={1}](%37) + -> (%40, %41, %y.5) + } + %42 : Byte() = prim::Constant[value={1}]() + -> (%42, %y.3) + } + return (%y); +} diff --git a/test/onnx/expect/TestOperators.test_flatten.expect b/test/onnx/expect/TestOperators.test_flatten.expect index a55811b061a9a1..355af689a3db80 100644 --- a/test/onnx/expect/TestOperators.test_flatten.expect +++ b/test/onnx/expect/TestOperators.test_flatten.expect @@ -134,7 +134,7 @@ graph { op_type: "Cast" attribute { name: "to" - i: 11 + i: 7 type: INT } } diff --git a/test/test_autograd.py b/test/test_autograd.py index 0c19e24208998e..6211de479b40be 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -75,7 +75,7 @@ def _function_test(self, cls): x = torch.randn(5, 5, requires_grad=True) y = torch.randn(5, 5, requires_grad=True) result = cls.apply(x, 2, y) - go = torch.ones(1, requires_grad=True) + go = torch.ones((), requires_grad=True) result.sum().backward(go, create_graph=True) self.assertEqual(x.grad.data, y.data + torch.ones(5, 5)) @@ -173,6 +173,23 @@ def backward(self, grad_output): MyFunction()(y).sum().backward() self.assertEqual(v.grad.data, torch.zeros(shape)) + def test_invalid_gradients(self): + class MyFunction(Function): + @staticmethod + def forward(ctx, x): + return x * 2 + + @staticmethod + def backward(ctx, grad_output): + return torch.randn(10, dtype=torch.float) + + with self.assertRaisesRegex(RuntimeError, 'expected shape'): + input = torch.randn(5, 5, dtype=torch.float, requires_grad=True) + MyFunction.apply(input).sum().backward() + with self.assertRaisesRegex(RuntimeError, 'expected type'): + input = torch.randn(10, dtype=torch.double, requires_grad=True) + MyFunction.apply(input).sum().backward() + def test_accumulate_grad(self): grad_output = torch.ones(5, 5) @@ -495,7 +512,6 @@ def test_backward(self): def test_sparse_backward(self): class FixedGradientFunction(Function): - def __init__(self, grad): self.grad = grad @@ -524,15 +540,15 @@ def backward(self, grad_x): dense_fn = FixedGradientFunction(dense_grad) # sparse first - x = torch.randn(5, 5, requires_grad=True) + x = torch.randn(size, requires_grad=True) (sparse_fn1(x) + dense_fn(x) + sparse_fn2(x)).sum().backward() self.assertEqual(x.grad, dense_grad + sparse_grad1 + sparse_grad2) # dense first - x = torch.randn(5, 5, requires_grad=True) + x = torch.randn(size, requires_grad=True) (dense_fn(x) + sparse_fn1(x) + sparse_fn2(x)).sum().backward() self.assertEqual(x.grad, dense_grad + sparse_grad1 + sparse_grad2) # sparse only - x = torch.randn(5, 5, requires_grad=True) + x = torch.randn(size, requires_grad=True) (sparse_fn1(x) + 
sparse_fn2(x)).sum().backward() self.assertEqual(x.grad, sparse_grad1 + sparse_grad2) @@ -2277,6 +2293,19 @@ def test_set_requires_grad_only_for_floats_cuda(self): def test_set_requires_grad_only_for_floats(self): self._test_set_requires_grad_only_for_floats(self, False) + @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") + def test_rnn_backward_to_input_but_not_parameters_cuda(self): + # this checks whether it is possible to not require + # weight parameters, but require inputs, see #7722 + dev = torch.device('cuda') + l = torch.nn.LSTM(2, 3).to(dev) + for p in l.parameters(): + p.requires_grad = False + s = torch.randn(1, 1, 2, requires_grad=True, device=dev) + out, _ = l(s) + out.sum().backward() + self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0) + def index_variable(shape, max_indices): if not isinstance(shape, tuple): diff --git a/test/test_cuda.py b/test/test_cuda.py index 0139c746f4f50b..d9bf6e66a41178 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -3,16 +3,23 @@ import tempfile import re import unittest +import sys from itertools import repeat import torch import torch.cuda import torch.cuda.comm as comm +from torch import multiprocessing as mp from test_torch import TestTorch from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, PY3 -from common_cuda import TEST_CUDA, TEST_MULTIGPU +# We cannot import TEST_CUDA and TEST_MULTIGPU from common_cuda here, +# because if we do that, the TEST_CUDNN line from common_cuda will be executed +# multiple times as well during the execution of this test suite, and it will +# cause CUDA OOM error on Windows. +TEST_CUDA = torch.cuda.is_available() +TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 if not TEST_CUDA: print('CUDA not available, skipping tests') @@ -85,6 +92,10 @@ def number(floating, integer, t): else: return integer + +def cast_tensor(tensor, t): + return t(tensor.size()).copy_(tensor) + S = 10 M = 50 @@ -401,6 +412,10 @@ def tmp(t): ('rsqrt', lambda t: constant_tensor_add(1, small_3d(t)), lambda t: [], None, float_types), ('sinh', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types), ('tan', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types), + ('__lshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(1, 5), t)), + lambda t: [2], None, signed_types), + ('__rshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(3, 7), t)), + lambda t: [2], None, signed_types), # lapack tests ('qr', small_2d_lapack, lambda t: [], 'square', float_types, False, unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), @@ -480,6 +495,8 @@ def tmp(t): 'tanh': 1e-3, 'trace': 1e-3, 'var': 1e-3, + '__lshift__': 1e-3, + '__rshift__': 1e-3, } simple_pointwise = [ @@ -1389,6 +1406,34 @@ def test_multinomial(self): sample = torch.multinomial(freqs, 1000, True) self.assertNotEqual(freqs[sample].min(), 0) + def _spawn_method(self, method, arg): + try: + mp.set_start_method('spawn') + except RuntimeError: + pass + with mp.Pool(1) as pool: + self.assertTrue(pool.map(method, [arg])) + + @staticmethod + def _test_multinomial_invalid_probs_cuda(probs): + try: + with torch.random.fork_rng(devices=[0]): + torch.multinomial(probs.to('cuda'), 1) + torch.cuda.synchronize() + return False # Should not be reached + except RuntimeError as e: + return 'device-side assert triggered' in str(e) + + @unittest.skipIf(not PY3, + "spawn start method is not supported in Python 2, \ + but we need it for creating another process with CUDA") 
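    # (A fresh spawned worker is used for each invalid-probs case below:
    #  after a device-side assert fires, the parent process's CUDA context
    #  would be left unusable, so the check has to run in a separate process
    #  and report back through the pool. This mirrors the CPU-side variant
    #  added to test_torch.py in this same change.)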
+ def test_multinomial_invalid_probs_cuda(self): + test_method = TestCuda._test_multinomial_invalid_probs_cuda + self._spawn_method(test_method, torch.Tensor([0, -1])) + self._spawn_method(test_method, torch.Tensor([0, float('inf')])) + self._spawn_method(test_method, torch.Tensor([0, float('-inf')])) + self._spawn_method(test_method, torch.Tensor([0, float('nan')])) + def test_broadcast(self): TestTorch._test_broadcast(self, lambda t: t.cuda()) @@ -1643,6 +1688,41 @@ def test_nvtx(self): torch.cuda.nvtx.mark("bar") torch.cuda.nvtx.range_pop() + def test_randperm_cuda(self): + cuda = torch.device('cuda:0') + + # For small inputs, randperm is offloaded to CPU instead + with torch.random.fork_rng(devices=[0]): + res1 = torch.randperm(100, device=cuda) + res2 = torch.cuda.LongTensor() + torch.randperm(100, out=res2, device=cuda) + self.assertEqual(res1, res2, 0) + + with torch.random.fork_rng(devices=[0]): + res1 = torch.randperm(100000, device=cuda) + res2 = torch.cuda.LongTensor() + torch.randperm(100000, out=res2, device=cuda) + self.assertEqual(res1, res2, 0) + + with torch.random.fork_rng(devices=[0]): + res1 = torch.randperm(100, dtype=torch.half, device=cuda) + res2 = torch.cuda.HalfTensor() + torch.randperm(100, out=res2, device=cuda) + self.assertEqual(res1, res2, 0) + + with torch.random.fork_rng(devices=[0]): + res1 = torch.randperm(50000, dtype=torch.half, device=cuda) + res2 = torch.cuda.HalfTensor() + torch.randperm(50000, out=res2, device=cuda) + self.assertEqual(res1, res2, 0) + + # randperm of 0 elements is an empty tensor + res1 = torch.randperm(0, device=cuda) + res2 = torch.cuda.LongTensor(5) + torch.randperm(0, out=res2, device=cuda) + self.assertEqual(res1.numel(), 0) + self.assertEqual(res2.numel(), 0) + def test_random_neg_values(self): TestTorch._test_random_neg_values(self, use_cuda=True) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 23bc1ba2ec0bd8..49122d41d53082 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -344,7 +344,6 @@ def test_sequential_pin_memory(self): self.assertTrue(input.is_pinned()) self.assertTrue(target.is_pinned()) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_multiple_dataloaders(self): loader1_it = iter(DataLoader(self.dataset, num_workers=1)) loader2_it = iter(DataLoader(self.dataset, num_workers=2)) @@ -355,7 +354,6 @@ def test_multiple_dataloaders(self): next(loader1_it) next(loader2_it) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") @unittest.skip("temporarily disable until flaky failures are fixed") def test_segfault(self): p = ErrorTrackingProcess(target=_test_segfault) @@ -373,7 +371,6 @@ def test_segfault(self): finally: p.terminate() - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_timeout(self): p = ErrorTrackingProcess(target=_test_timeout) p.start() @@ -386,7 +383,6 @@ def test_timeout(self): finally: p.terminate() - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_worker_seed(self): num_workers = 6 dataset = SynchronizedSeedDataset(num_workers, num_workers) @@ -396,7 +392,6 @@ def test_worker_seed(self): seeds.add(batch[0]) self.assertEqual(len(seeds), num_workers) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_worker_init_fn(self): dataset = SeedDataset(4) dataloader = DataLoader(dataset, batch_size=2, num_workers=2, @@ -411,19 +406,15 @@ def test_shuffle(self): def test_shuffle_batch(self): 
self._test_shuffle(DataLoader(self.dataset, batch_size=2, shuffle=True)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_sequential_workers(self): self._test_sequential(DataLoader(self.dataset, num_workers=4)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_seqential_batch_workers(self): self._test_sequential(DataLoader(self.dataset, batch_size=2, num_workers=4)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_shuffle_workers(self): self._test_shuffle(DataLoader(self.dataset, shuffle=True, num_workers=4)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_shuffle_batch_workers(self): self._test_shuffle(DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4)) @@ -446,12 +437,10 @@ def _test_batch_sampler(self, **kwargs): self.assertEqual(len(input), 3) self.assertEqual(input, self.data[offset:offset + 3]) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_batch_sampler(self): self._test_batch_sampler() self._test_batch_sampler(num_workers=4) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_shuffle_pin_memory(self): loader = DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4, pin_memory=True) @@ -478,11 +467,10 @@ def __len__(self): def test_error(self): self._test_error(DataLoader(ErrorDataset(100), batch_size=2, shuffle=True)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_error_workers(self): self._test_error(DataLoader(ErrorDataset(41), batch_size=2, shuffle=True, num_workers=4)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") + @unittest.skipIf(IS_WINDOWS, "FIXME: stuck test") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_partial_workers(self): "check that workers exit even if the iterator is not exhausted" @@ -637,7 +625,6 @@ class TestStringDataLoader(TestCase): def setUp(self): self.dataset = StringDataset() - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_shuffle_pin_memory(self): loader = DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4, pin_memory=True) @@ -721,7 +708,6 @@ def _run_ind_worker_queue_test(self, batch_size, num_workers): if current_worker_idx == num_workers: current_worker_idx = 0 - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_ind_worker_queue(self): for batch_size in (8, 16, 32, 64): for num_workers in range(1, 6): diff --git a/test/test_jit.py b/test/test_jit.py index e667520024be8a..7cccc6a73d9a25 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -92,7 +92,7 @@ def get_fn(file_name, script_path): return fn -class TestJit(TestCase): +class JitTestCase(TestCase): def assertExpectedONNXGraph(self, trace, *args, **kwargs): torch.onnx._optimize_trace(trace, aten=False) self.assertExpectedGraph(trace, *args, **kwargs) @@ -110,21 +110,6 @@ def assertExpectedGraph(self, trace, *args, **kwargs): torch._C._jit_pass_lint(graph) self.assertExpected(str(graph), *args, **kwargs) - def assertExportImport(self, trace, inputs): - initializers = [] - - def run(graph): - return torch._C.GraphExecutor(graph, False)(*inputs) - - proto, _ = trace.graph().export(initializers, onnx_opset_version=0, - defer_weight_export=False, 
export_raw_ir=True) - self.assertFalse(initializers) - - imported_graph, initializers = torch._C._jit_import_graph(proto) - self.assertFalse(initializers) - - self.assertEqual(run(trace.graph()), run(imported_graph)) - def run_pass(self, name, trace): if isinstance(trace, torch._C.Graph): graph = trace @@ -143,6 +128,23 @@ def run_pass(self, name, trace): trace.set_graph(graph) return graph + +class TestJit(JitTestCase): + def assertExportImport(self, trace, inputs): + initializers = [] + + def run(graph): + return torch._C.GraphExecutor(graph, False)(*inputs) + + proto, _ = trace.graph().export(initializers, onnx_opset_version=0, + defer_weight_export=False, export_raw_ir=True) + self.assertFalse(initializers) + + imported_graph, initializers = torch._C._jit_import_graph(proto) + self.assertFalse(initializers) + + self.assertEqual(run(trace.graph()), run(imported_graph)) + def test_simple(self): x = torch.tensor([0.4], requires_grad=True) y = torch.tensor([0.7], requires_grad=True) @@ -154,17 +156,16 @@ def f(x, y): self.assertExpectedGraph(trace) self.assertExportImport(trace, (x, y)) - # index-2 is not implemented in interpreter - @unittest.expectedFailure def test_index(self): x = torch.tensor([0.4], requires_grad=True) - y = torch.tensor([0], dtype=torch.int64, requires_grad=True) + y = torch.tensor([0], dtype=torch.int64) - @torch.jit.compile(nderivs=0) def fn(x, y): return x[y] - fn(x, y) # Fails + fn_traced = torch.jit.trace(x, y)(fn) + + self.assertEqual(fn(x, y), fn_traced(x, y)) # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, @@ -704,7 +705,10 @@ def allSum(vs): nograd_inputs = reference_tensors recording_inputs = [t.clone().requires_grad_() for t in reference_tensors] - ge = torch.jit.trace(*input_tensors, optimize=optimize)(func) + if isinstance(func, torch._C.Graph): + ge = torch._C.GraphExecutor(func, optimize) + else: + ge = torch.jit.trace(*input_tensors, optimize=optimize)(func) # test no gradients case @@ -925,8 +929,23 @@ def addmm(mat, mat1, mat2, alpha, beta): self.assertEqual(out_ref, out_test) self.assertExpected(canonical(addmm.graph)) + def test_index_put(self): + ten = torch.zeros(3, 3) + mask = torch.Tensor([[True, True, True], + [True, False, False], + [True, True, False]]).byte() + + def test_fn(ten, mask): + ten[mask] = torch.ones(6) + return ten + + traced_test_fn = torch.jit.trace(ten, mask)(test_fn) + + ten = torch.rand(3, 3) + self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) -class TestScript(TestCase): + +class TestScript(JitTestCase): @contextmanager def capture_stdout(self): @@ -982,7 +1001,7 @@ def checkScript(self, script, inputs, optimize=True, outputs=None, name='func', source = textwrap.dedent(inspect.getsource(script)) self.checkScript(source, inputs, optimize, outputs, script.__name__, capture_output, frames_up=2) # Continue checking the Python frontend - ge = torch.jit.script(script, _frames_up=1) + ge = torch.jit.script(script, optimize, _frames_up=1) if capture_output: with self.capture_stdout() as captured: @@ -1133,7 +1152,7 @@ def func(x, y): out = func(x, y) self.assertEqual(func(x, y), x + y) - grad = torch.randn(2, 3) + grad = torch.randn(2, 3, dtype=torch.float) out.backward(grad) self.assertEqual(x.grad, grad) self.assertEqual(y.grad, grad.sum(dim=0)) @@ -1165,6 +1184,24 @@ def func(x): def func(x): return torch.cat((x, x), x, dim=0) + def test_cat_lifts(self): + @torch.jit.script + def foo(x): + return torch.cat([x, x], dim=1) + + @torch.jit.script + 
def foo2(x): + return torch.cat([], dim=1) + + @torch.jit.script + def foo3(x): + return torch.cat([x], dim=1) + + self.assertExpected( + canonical(foo.graph) + + canonical(foo2.graph) + + canonical(foo3.graph)) + def test_func_call(self): script = ''' def add(a, b): @@ -1385,18 +1422,15 @@ def func(a, b): self.checkScript(func, inputs, optimize=True) def test_script_for_in_range(self): - script = ''' - def test_for_in_range(): + def fn(): c = 0 for i in range(100): c += i return c - ''' - self.checkScript(script, [], outputs=[4950], optimize=True, name='test_for_in_range') + self.checkScript(fn, (), outputs=4950, optimize=True) def test_script_for_in_range_dynamic(self): - script = ''' - def test_script_for_in_range_dynamic(): + def fn(): c = 0 for i in range(100): acc = 0 @@ -1404,8 +1438,7 @@ def test_script_for_in_range_dynamic(): acc += j c += acc return c - ''' - self.checkScript(script, [], outputs=[161700], optimize=True, name='test_script_for_in_range_dynamic') + self.checkScript(fn, (), optimize=False) def test_script_for_in_range_ast(self): @torch.jit.script @@ -2739,7 +2772,7 @@ def foo(a, b, x): b = x return a - self.checkScript(foo, (torch.zeros(1), torch.zeros(4), torch.zeros(5)), False) + self.checkScript(foo, (torch.zeros(1), torch.zeros(4), torch.zeros(5)), optimize=False) def test_intlist_args(self): def func_1(x): @@ -2920,6 +2953,78 @@ def test_rand(): self.checkScript(test_rand, ()) + def test_loop_unrolling(self): + def fn(x): + y = 0 + for i in range(x): + y += i + return y + + graph = torch.jit._script_graph(fn) + self.run_pass('loop_unrolling', graph) + self.assertExpectedGraph(graph) + self.checkScript(fn, (torch.tensor(10),)) + + def test_loop_unrolling_const(self): + def fn(): + y = 0 + for i in range(10): + y += 1 + return y + + def fn2(): + y = 0 + for i in range(10): + y += i + return y + + def check(fn, name): + graph = torch.jit._script_graph(fn) + self.run_pass('loop_unrolling', graph) + self.assertExpectedGraph(graph, subname=name) + self.checkScript(fn, ()) + + check(fn, 'add_const') + check(fn2, 'add_iter') + + def test_loop_unrolling_nested(self): + def fn(x): + y = 0 + for i in range(10): + for j in range(x): + y += j + return y + + graph = torch.jit._script_graph(fn) + self.run_pass('loop_unrolling', graph) + self.assertExpectedGraph(graph) + self.checkScript(fn, (torch.tensor(10),)) + + def test_loop_unroll_unused_counter(self): + def fn(x): + y = 0 + for i in range(x): + y += 1 + return y + + graph = torch.jit._script_graph(fn) + self.run_pass('loop_unrolling', graph) + self.assertExpectedGraph(graph) + + def test_loop_unroll_negative(self): + def fn(x): + y = 0 + for i in range(x): + y += 1 + return y + + self.checkScript(fn, (torch.tensor(-20),)) + self.checkScript(fn, (torch.tensor(-2),)) + self.checkScript(fn, (torch.tensor(-1),)) + self.checkScript(fn, (torch.tensor(0),)) + self.checkScript(fn, (torch.tensor(1),)) + self.checkScript(fn, (torch.tensor(2),)) + # Smoke tests for export methods class TestPytorchExportModes(unittest.TestCase): diff --git a/test/test_nn.py b/test/test_nn.py index 3e3356dfad4725..13b7013200b805 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -2430,16 +2430,15 @@ def test_parallel_apply(self): i2 = torch.randn(2, 10, device="cuda:1", dtype=torch.float) expected1 = l1(i1).data expected2 = l2(i2).data - inputs = ((i1,), (i2,)) modules = (l1, l2) expected_outputs = (expected1, expected2) - outputs = dp.parallel_apply(modules, inputs, None) - for out, expected in zip(outputs, expected_outputs): - 
self.assertEqual(out.data, expected) - - inputs = (i1, i2.new_empty(0)) - expected_outputs = (expected1, expected2.new_empty(0)) + # each input can be either a collection of positional arguments + # or an object representing the single argument + for inputs in [((i1,), (i2,)), (i1, i2)]: + outputs = dp.parallel_apply(modules, inputs, None) + for out, expected in zip(outputs, expected_outputs): + self.assertEqual(out.data, expected) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_data_parallel_multiple_input(self): @@ -6289,7 +6288,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6298,7 +6296,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_input', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6307,7 +6304,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='affine_simple_average', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6316,7 +6312,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6325,7 +6320,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6334,7 +6328,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_input_not_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6342,7 +6335,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(2, 3, 6, 6), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6351,7 +6343,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='2d_simple_average', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6360,7 +6351,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='momentum', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6369,7 +6359,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6378,7 +6367,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6386,7 +6374,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(2, 3, 4, 4, 4), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6395,7 +6382,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_simple_average', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6404,7 +6390,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='momentum', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6413,7 +6398,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, 
desc='not_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6422,7 +6406,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm1d', @@ -6430,7 +6413,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(4, 3, 15), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm1d', @@ -6439,7 +6421,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm2d', @@ -6447,7 +6428,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(2, 3, 6, 6), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm2d', @@ -6456,7 +6436,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm3d', @@ -6464,7 +6443,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(2, 3, 4, 4, 4), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm3d', @@ -6473,7 +6451,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='LayerNorm', @@ -6482,7 +6459,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_elementwise_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='LayerNorm', @@ -6491,7 +6467,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_no_elementwise_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='LayerNorm', @@ -6500,7 +6475,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_elementwise_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='LayerNorm', @@ -6509,7 +6483,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_no_elementwise_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6518,7 +6491,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6527,7 +6499,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_no_affine_IN', # this setting is equivalent with InstanceNorm - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6536,7 +6507,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_no_affine_LN', # this setting is equivalent with LayerNorm - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6545,7 +6515,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='2d_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6554,7 +6523,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='2d_no_affine_IN', # this setting is equivalent with InstanceNorm - 
decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6563,7 +6531,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='2d_no_affine_LN', # this setting is equivalent with LayerNorm - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='Conv1d', diff --git a/test/test_sparse.py b/test/test_sparse.py index 3c9e0e93524428..f2d557faca05cd 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -910,6 +910,12 @@ def test_factory_size_check(self): with self.assertRaisesRegex(RuntimeError, "values and sizes are inconsistent"): self.SparseTensor(indices, values, sizes) + def test_factory_empty_indices(self): + device = 'cuda' if self.is_cuda else 'cpu' + tensor = torch.sparse_coo_tensor([], [], torch.Size([]), device=device) + expected_indices = torch.tensor([], dtype=torch.long, device=device) + self.assertEqual(tensor._indices(), expected_indices) + @cpu_only def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) diff --git a/test/test_torch.py b/test/test_torch.py index 723481db1c4443..1d331b40abca60 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -17,6 +17,7 @@ from torch._utils import _rebuild_tensor from itertools import product, combinations from functools import reduce +from torch import multiprocessing as mp from common import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ run_tests, download_file, skipIfNoLapack, suppress_warnings, IS_WINDOWS, PY3 @@ -2368,6 +2369,32 @@ def make_prob_dist(shape, is_contiguous): def test_multinomial(self): self._test_multinomial(self, torch.FloatTensor) + def _spawn_method(self, method, arg): + try: + mp.set_start_method('spawn') + except RuntimeError: + pass + with mp.Pool(1) as pool: + self.assertTrue(pool.map(method, [arg])) + + @staticmethod + def _test_multinomial_invalid_probs(probs): + try: + torch.multinomial(probs.to('cpu'), 1) + return False # Should not be reached + except RuntimeError as e: + return 'invalid multinomial distribution' in str(e) + + @unittest.skipIf(not PY3, + "spawn start method is not supported in Python 2, \ + but we need it for for testing failure case for CPU RNG on Windows") + def test_multinomial_invalid_probs(self): + test_method = TestTorch._test_multinomial_invalid_probs + self._spawn_method(test_method, torch.Tensor([0, -1])) + self._spawn_method(test_method, torch.Tensor([0, float('inf')])) + self._spawn_method(test_method, torch.Tensor([0, float('-inf')])) + self._spawn_method(test_method, torch.Tensor([0, float('nan')])) + @suppress_warnings def test_range(self): res1 = torch.range(0, 1) diff --git a/test/test_utils.py b/test/test_utils.py index 42d4b6444736a1..91dfdbff7f2143 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -17,7 +17,6 @@ from torch.utils.trainer import Trainer from torch.utils.trainer.plugins import * from torch.utils.trainer.plugins.plugin import Plugin -from torch.utils.serialization import load_lua from torch.autograd._functions.utils import prepare_onnx_paddings from torch.autograd._functions.utils import check_onnx_broadcast from common import IS_WINDOWS, IS_PPC @@ -281,7 +280,6 @@ def test_multi_keep(self): dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_multi_drop(self): dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size, @@ -593,14 +591,14 @@ def 
_check_autograd_summary(self, output): 'Distance between autograd prof output and end of output not in [6, 100] lines', output)) def _check_cuda(self, output): - if torch.cuda.is_available(): + if HAS_CUDA: results = re.search('CUDA mode', output) self.assertIsNotNone(results, self._fail_msg('Should tell users CUDA', output)) else: results = re.search('CUDA mode', output) self.assertIsNone(results, self._fail_msg('Should not tell users about CUDA', output)) - @unittest.skipIf(torch.cuda.is_available(), 'CPU-only test') + @unittest.skipIf(HAS_CUDA, 'CPU-only test') def test_bottleneck_cpu_only(self): rc, out, err = self._run_bottleneck('bottleneck/test.py') self.assertEqual(rc, 0, 'Run failed with\n{}'.format(err)) @@ -611,8 +609,7 @@ def test_bottleneck_cpu_only(self): self._check_cprof_summary(out) self._check_cuda(out) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") - @unittest.skipIf(not torch.cuda.is_available(), 'No CUDA') + @unittest.skipIf(not HAS_CUDA, 'No CUDA') def test_bottleneck_cuda(self): rc, out, err = self._run_bottleneck('bottleneck/test_cuda.py') self.assertEqual(rc, 0, 'Run failed with\n{}'.format(err)) @@ -747,6 +744,7 @@ def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): try_check_onnx_broadcast(dims1, dims2, True, False) -TestLuaReader.init() if __name__ == '__main__': + from torch.utils.serialization import load_lua + TestLuaReader.init() run_tests() diff --git a/third_party/benchmark b/third_party/benchmark index 105ac14b2f671f..505be96ab23056 160000 --- a/third_party/benchmark +++ b/third_party/benchmark @@ -1 +1 @@ -Subproject commit 105ac14b2f671f7d448f621290404098189f424d +Subproject commit 505be96ab23056580a3a2315abba048f4428b04e diff --git a/third_party/gloo b/third_party/gloo index aad0002fb40612..69eef748cc1dfb 160000 --- a/third_party/gloo +++ b/third_party/gloo @@ -1 +1 @@ -Subproject commit aad0002fb40612e991390d8e807f247ed23f13c5 +Subproject commit 69eef748cc1dfbe0fefed69b34e6545495f67ac5 diff --git a/third_party/onnx b/third_party/onnx index 968d28d901d2ef..3a035f439799de 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 968d28d901d2efdaf6d5fcfd529106762524cdfa +Subproject commit 3a035f439799de3568c364f3f87014841037708e diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index 315220ec4c7072..5ce6a5514a10e8 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ b/tools/amd_build/build_pytorch_amd.py @@ -53,7 +53,7 @@ if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False): continue # Update contents. 
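    # (The explicit encoding="utf-8" is dropped in the hunk below, presumably
    #  so this script also runs under Python 2, whose built-in open() does not
    #  accept an encoding argument.)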
- with open(source, "r+", encoding="utf-8") as f: + with open(source, "r+") as f: contents = f.read() contents = contents.replace("WITH_CUDA", "WITH_ROCM") contents = contents.replace("CUDA_VERSION", "0") diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml index 60fd665e08d1ae..65dfb955a020a2 100644 --- a/tools/amd_build/disabled_features.yaml +++ b/tools/amd_build/disabled_features.yaml @@ -1,5 +1,5 @@ { - "disabled_hip_function_calls": + "disable_unsupported_hip_calls": [ { "path": "aten/src/THC/generic/THCTensorSort.cu", @@ -72,6 +72,12 @@ "#include ": "" } }, + { + "path": "aten/src/ATen/native/cuda/Distributions.cu", + "s_constants": { + "#include ": "" + } + }, { "path": "aten/src/THC/THCNumerics.cuh", "s_constants": { @@ -129,10 +135,16 @@ "disabled_modules": [ "aten/src/ATen/native/cuda/CuFFTUtils.h", "aten/src/ATen/native/cuda/SpectralOps.cu", - "aten/src/ATen/native/cuda/Distributions.cu", - "aten/src/THCUNN/RReLU.cu" + "aten/src/THCUNN/RReLU.cu", + "aten/src/ATen/native/cuda/Distributions.cu" ], "disabled_functions": [ + { + "path": "aten/src/ATen/cuda/CUDAApplyUtils.cuh", + "functions": [ + "kernelPointwiseApply4" + ] + }, { "path": "aten/src/ATen/cuda/detail/IndexUtils.cu", "non_device_functions": [ @@ -148,7 +160,9 @@ { "path": "aten/src/ATen/native/cuda/Distributions.cu", "functions": [ - "_s_poisson_cuda" + "_s_poisson_cuda", + "poisson_cuda_kernel", + "gamma_cuda_kernel" ] }, { @@ -304,3 +318,4 @@ } ] } + diff --git a/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch b/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch deleted file mode 100644 index d1bcb76bb5866d..00000000000000 --- a/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch +++ /dev/null @@ -1,23 +0,0 @@ -diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh -index 2653fed0b..c4e9089e0 100644 ---- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh -+++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh -@@ -19,13 +19,17 @@ inline __host__ __device__ float fmaxType(float x, half y) { - } - #endif - -+/* In ROCm we have a conversion from half to __fp16, and then there's a -+conversion operator from __fp16 to double (w/ the standard conversion -+double to float), so comment out these two lines to prevent ambiguous calls -+for fmaxType when half is passed in. - inline __host__ __device__ float fmaxType(float x, float y) { - return fmaxf(x, y); - } - - inline __host__ __device__ double fmaxType(double x, double y) { - return fmax(x, y); --} -+}*/ - - #ifdef CUDA_HALF_TENSOR - diff --git a/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch b/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch index 72e4160ea4a994..29c27ba5a90874 100644 --- a/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch +++ b/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch @@ -1,39 +1,37 @@ diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh -index ca6bf7cbe..a523648f1 100644 +index d118f5dd3..dda7f3c9c 100644 --- a/aten/src/THC/THCTensorMathReduce.cuh +++ b/aten/src/THC/THCTensorMathReduce.cuh -@@ -105,7 +105,7 @@ struct SquareFunctor { +@@ -73,14 +73,14 @@ struct SquareFunctor { template struct ReduceMin { inline __device__ T operator()(T a, T b) const { -- return (THCNumerics::lt(a, b) || -+ return ((int)THCNumerics::sub(a, b) < 0 || - THCNumerics::isnan(a)) ? 
a : b; +- return (THCNumerics::lt(a, b) || THCNumerics::isnan(a)) ? a : b; ++ return ((int)THCNumerics::sub(a, b) < 0 || THCNumerics::isnan(a)) ? a : b; } }; -@@ -113,7 +113,7 @@ struct ReduceMin { + template struct ReduceMax { inline __device__ T operator()(T a, T b) const { -- return (THCNumerics::gt(a, b) || -+ return ((int)THCNumerics::sub(a, b) > 0 || - THCNumerics::isnan(a)) ? a : b; +- return (THCNumerics::gt(a, b) || THCNumerics::isnan(a)) ? a : b; ++ return ((int)THCNumerics::sub(a, b) > 0 || THCNumerics::isnan(a)) ? a : b; } }; -@@ -167,7 +167,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd - buffer[tx] = ScalarConvert::to(0); - Real norm; -- -+ #if !defined(__HIP_DEVICE_COMPILE__) - if (THCNumerics::eq(value, ScalarConvert::to(INFINITY))) { - // get norm of axis - for (ptrdiff_t i=tx; i::mul(row[i], norm); +@@ -108,6 +108,7 @@ __global__ void THCTensor_kernel_renorm(T *data, + const AccT value, + const ptrdiff_t size, + const AccT maxnorm) { ++#if !defined(__HIP_DEVICE_COMPILE__) + __shared__ AccT buffer[32]; + int64_t tx = threadIdx.x; + int64_t bx = blockIdx.x; +@@ -163,6 +164,7 @@ __global__ void THCTensor_kernel_renorm(T *data, + row[i] = scalar_cast(THCNumerics::mul(val, norm)); } } -+ #endif ++#endif } template diff --git a/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch b/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch index 8fbc7ad7e90643..355c1e3a7c3faf 100644 --- a/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch +++ b/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch @@ -1,5 +1,5 @@ diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu -index 906780b4f..c99f156ab 100644 +index 906780b4f..b03e051cb 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -504,11 +504,11 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_ @@ -7,7 +7,7 @@ index 906780b4f..c99f156ab 100644 if (range > 1ULL << 32) { generate_random_64<<>>( - gen->state.gen_states, size, data, min_val, range); -+ gen->state.gen_states, static_cast(size), data, static_cast::value, float, double>::type>(min_val), static_cast::value, float, double>::type>(range)); ++ gen->state.gen_states, static_cast(size), data, min_val, range); } else { #endif generate_random<<>>( @@ -17,7 +17,7 @@ index 906780b4f..c99f156ab 100644 } #endif @@ -534,19 +534,19 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_) - + #if defined(THC_REAL_IS_HALF) generate_random<<>>( - gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1); @@ -39,5 +39,5 @@ index 906780b4f..c99f156ab 100644 - gen->state.gen_states, size, data, 0UL, static_cast(std::numeric_limits::max()) + 1); + gen->state.gen_states, static_cast(size), data, static_cast(0UL), static_cast(std::numeric_limits::max()) + 1); #endif - + THCTensor_(freeCopyTo)(state, self, self_); diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 8792233d572f55..8a1adeb37efd9f 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -690,7 +690,7 @@ - name: zero_(Tensor self) self: zeros_like(grad) -- name: _sparse_mask(Tensor self, SparseTensor mask) +- name: _sparse_mask(Tensor self, SparseTensorRef mask) self: not_implemented("_sparse_mask") mask: not_implemented("_sparse_mask") diff --git a/tools/autograd/gen_autograd_functions.py 
b/tools/autograd/gen_autograd_functions.py index a343050eb34806..c91317e94043c7 100644 --- a/tools/autograd/gen_autograd_functions.py +++ b/tools/autograd/gen_autograd_functions.py @@ -18,7 +18,7 @@ struct ${op} : public ${superclass} { using ${superclass}::${superclass}; variable_list apply(const variable_list& grads) override; - std::string name() override { return "${op}"; } + std::string name() const override { return "${op}"; } void release_variables() override { ${release_variables} } @@ -63,6 +63,12 @@ } """) +DERIVATIVE_MULTI_COPY_RANGE = CodeTemplate("""\ + if (should_compute_output({ ${name}_ix })) { + copy_range(grad_inputs, ${name}_ix, std::get<${i}>(grad_result)); + } +""") + DERIVATIVE_MULTI = CodeTemplate("""\ if (should_compute_output({ ${idx_ranges} })) { ${grad_input_mask} @@ -173,7 +179,7 @@ def emit_derivative(derivative): idx_ranges = ', '.join("{}_ix".format(n) for n in var_names) copy_ranges = [] for i, n in enumerate(var_names): - copy_ranges.append("copy_range(grad_inputs, {}_ix, std::get<{}>(grad_result));".format(n, i)) + copy_ranges.append(DERIVATIVE_MULTI_COPY_RANGE.substitute(name=n, i=i)) return DERIVATIVE_MULTI.substitute( idx_ranges=idx_ranges, copy_ranges=copy_ranges, derivative=formula, diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 34b06d4c64a3c5..3b4d4216a20dcb 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -129,12 +129,12 @@ def should_generate_python_binding(declaration): return False # TODO: fix handling of SparseTensor. We don't want to generate Python - # bindings to SparseTensor overloads, such as add(Tensor, SparseTensor), + # bindings to SparseTensor overloads, such as add(Tensor, SparseTensorRef), # since the Tensor-based signature already dynamically dispatches correctly. # However, _sparse_mask only has a SparseTensor signature so we need to bind # that function. 
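[Editor's note] To make the gen_autograd_functions.py change above concrete: each copy into grad_inputs is now emitted from the DERIVATIVE_MULTI_COPY_RANGE template, so every output gets its own should_compute_output guard instead of an unguarded copy_range(...) line built with str.format. A rough illustration of the expansion, using the standard-library string.Template as a stand-in for the codegen's own CodeTemplate helper (whitespace is approximate):

```python
from string import Template  # stand-in for the codegen's CodeTemplate class

DERIVATIVE_MULTI_COPY_RANGE = Template("""\
if (should_compute_output({ ${name}_ix })) {
  copy_range(grad_inputs, ${name}_ix, std::get<${i}>(grad_result));
}
""")

# For a derivative with variables ("self", "other"), emit_derivative would now
# produce one guarded block per output; e.g. for n="other", i=1:
print(DERIVATIVE_MULTI_COPY_RANGE.substitute(name="other", i=1))
# if (should_compute_output({ other_ix })) {
#   copy_range(grad_inputs, other_ix, std::get<1>(grad_result));
# }
```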
for arg in declaration['arguments']: - if arg['type'] == 'SparseTensor' and declaration['name'] != '_sparse_mask': + if arg['type'] == 'SparseTensorRef' and declaration['name'] != '_sparse_mask': return False return True @@ -207,7 +207,7 @@ def create_python_bindings(python_functions, has_self, is_module=False): unpack_methods = { 'const Tensor &': 'tensor', - 'SparseTensor': 'tensor', + 'SparseTensorRef': 'tensor', 'Tensor &': 'tensor', 'Generator *': 'generator', 'Storage &': 'storage', @@ -310,8 +310,8 @@ def parse_arg(arg, arg_index, unpack_args=False): if typename == 'Storage &': expr = '*' + expr - if typename == 'SparseTensor': - expr = 'SparseTensor({})'.format(expr) + if typename == 'SparseTensorRef': + expr = 'SparseTensorRef({})'.format(expr) dispatch_type = typename if dispatch_type == 'Tensor': @@ -542,7 +542,7 @@ def process_function(name, declarations): if not has_self: # Use 'input' instead of 'self' for NN functions signature = signature.replace('Tensor self', 'Tensor input') - signature = signature.replace('SparseTensor', 'Tensor') + signature = signature.replace('SparseTensorRef', 'Tensor') if dictionary['base'].get('deprecated', False): signature += '|deprecated' env['signatures'].append('"{}",'.format(signature)) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index c402855a8ba800..6348807be93eb7 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -328,7 +328,7 @@ def save_variables(saved_variables, is_output): def reference_args(args): res = [] for arg in args: - if arg['type'] == 'SparseTensor': + if arg['type'] == 'SparseTensorRef': res.append('{}.tref'.format(arg['name'])) else: res.append(arg['name']) @@ -538,7 +538,7 @@ def requires_unpack(arg): dynamic_type = arg['dynamic_type'] is_nullable = arg.get('is_nullable', False) - ref = (not is_nullable) and dynamic_type not in ['TensorList', 'SparseTensor'] + ref = (not is_nullable) and dynamic_type not in ['TensorList', 'SparseTensorRef'] suffix = '_opt' if is_nullable else '' body.append(UNPACK_TENSOR.substitute( diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 38fa9c2ed654eb..7c8a81cc2de8d4 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -36,7 +36,7 @@ namespace torch { namespace autograd { // don't want to make the codegen do the dispatch manually) static void setattr(jit::Node* n, jit::Symbol name, int64_t v) { n->i_(name, v); } static void setattr(jit::Node* n, jit::Symbol name, const at::Scalar& v) { n->t_(name, v.toTensor()); } -static void setattr(jit::Node* n, jit::Symbol name, SparseTensor s) { n->t_(name, s.tref); } +static void setattr(jit::Node* n, jit::Symbol name, SparseTensorRef s) { n->t_(name, s.tref); } static void setattr(jit::Node* n, jit::Symbol name, const at::IntList& v) { n->is_(name, v); } static void setattr(jit::Node* n, jit::Symbol name, bool v) { n->i_(name, v); } static void setattr(jit::Node* n, jit::Symbol name, double v) { n->f_(name, v); } @@ -60,7 +60,7 @@ void failPositionalAttr() { static void setposattr(jit::Node* n, size_t idx, const char *name, int64_t v) { genericInsertInput(n, idx, v); } static void setposattr(jit::Node* n, size_t idx, const char *name, const at::Scalar& v) { genericInsertInput(n, idx, v); } -static void setposattr(jit::Node* n, size_t idx, const char *name, SparseTensor s) { failPositionalAttr(); } +static void setposattr(jit::Node* n, size_t idx, const char *name, 
SparseTensorRef s) { failPositionalAttr(); } static void setposattr(jit::Node* n, size_t idx, const char *name, const at::IntList& v) { using ArgumentStash = jit::tracer::ArgumentStash; if (ArgumentStash::hasIntList(name)) { @@ -236,8 +236,8 @@ Tensor & VariableType::unpack(const Tensor & t, const char * name, int pos) { return checked_cast_variable(t, name, pos).data(); } -SparseTensor VariableType::unpack(SparseTensor t, const char * name, int pos) { - return SparseTensor(checked_cast_variable(t.tref, name, pos).data()); +SparseTensorRef VariableType::unpack(SparseTensorRef t, const char * name, int pos) { + return SparseTensorRef(checked_cast_variable(t.tref, name, pos).data()); } Tensor VariableType::unpack_opt(const Tensor & t, const char * name, int pos) { @@ -359,35 +359,35 @@ static void throw_error_out_requires_grad(const char* name) { static void rebase_history(Variable& var, std::shared_ptr grad_fn) { if (grad_fn && var.defined()) { - grad_fn->set_num_inputs(1); + grad_fn->add_input_metadata(var.type(), var.sizes()); var.rebase_history({std::move(grad_fn), 0}); } } static void rebase_history(ArrayRef vars, std::shared_ptr grad_fn) { if (grad_fn) { - grad_fn->set_num_inputs(vars.size()); - uint32_t output_nr = 0; for (auto& var : vars) { if (var.defined()) { // TODO: eliminate const_cast + auto output_nr = grad_fn->add_input_metadata(var.type(), var.sizes()); const_cast(var).rebase_history({grad_fn, output_nr}); + } else { + grad_fn->add_input_metadata(Function::undefined_input()); } - output_nr++; } } } static void set_history(ArrayRef vars, std::shared_ptr grad_fn) { if (grad_fn) { - grad_fn->set_num_inputs(vars.size()); - uint32_t output_nr = 0; for (auto& var : vars) { if (var.defined()) { // TODO: eliminate const_cast + auto output_nr = grad_fn->add_input_metadata(var.type(), var.sizes()); const_cast(var).set_gradient_edge({grad_fn, output_nr}); + } else { + grad_fn->add_input_metadata(Function::undefined_input()); } - output_nr++; } } } @@ -428,7 +428,6 @@ Tensor & VariableType::s_copy_(Tensor & self, const Tensor & src, bool non_block if (requires_grad) { grad_fn = std::make_shared(); grad_fn->set_next_edges(collect_next_edges(self, src)); - grad_fn->set_num_inputs(1); grad_fn->src_type = &src.type(); grad_fn->src_device = src.is_cuda() ? 
src.get_device() : -1; } diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index c90e3f5247f17c..01cd96c4396257 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -17,7 +17,7 @@ using at::Context; using at::Generator; using at::IntList; using at::Scalar; -using at::SparseTensor; +using at::SparseTensorRef; using at::Storage; using at::Tensor; using at::TensorList; @@ -62,7 +62,7 @@ struct VariableType final : public at::Type { // checks that t is actually a Variable static Variable & checked_cast_variable(const Tensor & t, const char * name, int pos); static at::Tensor & unpack(const Tensor & t, const char * name, int pos); - static at::SparseTensor unpack(SparseTensor t, const char * name, int pos); + static at::SparseTensorRef unpack(SparseTensorRef t, const char * name, int pos); static at::Tensor unpack_opt(const Tensor & t, const char * name, int pos); static std::vector unpack(at::TensorList tl, const char *name, int pos); diff --git a/tools/autograd/templates/python_torch_functions_dispatch.h b/tools/autograd/templates/python_torch_functions_dispatch.h index 45c4f79868ecc6..c6ab4d5d18e55f 100644 --- a/tools/autograd/templates/python_torch_functions_dispatch.h +++ b/tools/autograd/templates/python_torch_functions_dispatch.h @@ -19,7 +19,7 @@ using at::Scalar; using at::TensorList; using at::IntList; using at::Generator; -using at::SparseTensor; +using at::SparseTensorRef; using at::Storage; static at::Type& default_type() { diff --git a/tools/autograd/templates/python_variable_methods_dispatch.h b/tools/autograd/templates/python_variable_methods_dispatch.h index c17b30c51ed116..82e7e409472e29 100644 --- a/tools/autograd/templates/python_variable_methods_dispatch.h +++ b/tools/autograd/templates/python_variable_methods_dispatch.h @@ -16,7 +16,7 @@ using at::Scalar; using at::TensorList; using at::IntList; using at::Generator; -using at::SparseTensor; +using at::SparseTensorRef; using at::Storage; ${py_method_dispatch} diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 07683866995f75..7a7970e7f100e9 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -26,6 +26,13 @@ IF "%~1"=="--with-cuda" ( set /a USE_CUDA=0 ) +IF "%~1"=="--with-rocm" ( + set /a WITH_ROCM=1 + shift +) ELSE ( + set /a WITH_ROCM=0 +) + IF "%~1"=="--with-nnpack" ( set /a NO_NNPACK=0 set /a USE_NNPACK=1 @@ -35,6 +42,29 @@ IF "%~1"=="--with-nnpack" ( set /a USE_NNPACK=0 ) +IF "%~1"=="--with-mkldnn" ( + set /a NO_MKLDNN=0 + shift +) ELSE ( + set /a NO_MKLDNN=1 +) + +IF "%~1"=="--with-gloo-ibverbs" ( + set /a WITH_GLOO_IBVERBS=1 + echo Warning: gloo iverbs is enabled but build is not yet implemented 1>&2 + shift +) ELSE ( + set /a WITH_GLOO_IBVERBS=0 +) + +IF "%~1"=="--with-distributed-mw" ( + set /a WITH_DISTRIBUTED_MW=1 + echo Warning: distributed mw is enabled but build is not yet implemented 1>&2 + shift +) ELSE ( + set /a WITH_DISTRIBUTED_MW=0 +) + set BUILD_TYPE=Release IF "%DEBUG%"=="1" ( set BUILD_TYPE=Debug @@ -144,7 +174,7 @@ goto:eof mkdir build cd build cmake .. 
%CMAKE_GENERATOR_COMMAND% ^ - -DCMAKE_INSTALL_PREFIX="%INSTALL_DIR%" ^ + -DCMAKE_BUILD_TYPE=%BUILD_TYPE% ^ -DBUILD_CAFFE2=OFF ^ -DBUILD_ATEN=ON ^ -DBUILD_PYTHON=OFF ^ @@ -154,7 +184,18 @@ goto:eof -DCUDNN_INCLUDE_DIR="%CUDNN_INCLUDE_DIR%" ^ -DCUDNN_LIB_DIR="%CUDNN_LIB_DIR%" ^ -DCUDNN_LIBRARY="%CUDNN_LIBRARY%" ^ - -DCMAKE_BUILD_TYPE=%BUILD_TYPE% + -DNO_MKLDNN=%NO_MKLDNN% ^ + -DMKLDNN_INCLUDE_DIR="%MKLDNN_INCLUDE_DIR%" ^ + -DMKLDNN_LIB_DIR="%MKLDNN_LIB_DIR%" ^ + -DMKLDNN_LIBRARY="%MKLDNN_LIBRARY%" ^ + -DATEN_NO_CONTRIB=1 ^ + -DCMAKE_INSTALL_PREFIX="%INSTALL_DIR%" ^ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ^ + -DCMAKE_C_FLAGS="%USER_CFLAGS%" ^ + -DCMAKE_CXX_FLAGS="/EHa %USER_CFLAGS%" ^ + -DCMAKE_EXE_LINKER_FLAGS="%USER_LDFLAGS%" ^ + -DCMAKE_SHARED_LINKER_FLAGS="%USER_LDFLAGS%" ^ + -DWITH_ROCM=%WITH_ROCM% %MAKE_COMMAND% IF NOT %ERRORLEVEL%==0 exit 1 diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 60e1dfeac863bf..0323b96c835b31 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -17,6 +17,7 @@ WITH_NNPACK=0 WITH_MKLDNN=0 WITH_GLOO_IBVERBS=0 WITH_DISTRIBUTED_MW=0 +FULL_CAFFE2=0 while [[ $# -gt 0 ]]; do case "$1" in --with-cuda) @@ -37,6 +38,9 @@ while [[ $# -gt 0 ]]; do --with-distributed-mw) WITH_DISTRIBUTED_MW=1 ;; + --full-caffe2) + FULL_CAFFE2=1 + ;; *) break ;; @@ -81,12 +85,10 @@ C_FLAGS=" -DTH_INDEX_BASE=0 -I\"$INSTALL_DIR/include\" \ # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=686926 C_FLAGS="${C_FLAGS} -DOMPI_SKIP_MPICXX=1" LDFLAGS="-L\"$INSTALL_DIR/lib\" " -LD_POSTFIX=".so.1" -LD_POSTFIX_UNVERSIONED=".so" +LD_POSTFIX=".so" if [[ $(uname) == 'Darwin' ]]; then LDFLAGS="$LDFLAGS -Wl,-rpath,@loader_path" - LD_POSTFIX=".1.dylib" - LD_POSTFIX_UNVERSIONED=".dylib" + LD_POSTFIX=".dylib" else LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" fi @@ -139,6 +141,9 @@ function build() { nanopb ) BUILD_C_FLAGS=$C_FLAGS" -fPIC -fexceptions";; *) BUILD_C_FLAGS=$C_FLAGS" -fexceptions";; esac + # TODO: The *_LIBRARIES cmake variables should eventually be + # deprecated because we are using .cmake files to handle finding + # installed libraries instead ${CMAKE_VERSION} ../../$1 -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/FindCUDA" \ ${CMAKE_GENERATOR} \ -DTorch_FOUND="1" \ @@ -177,9 +182,6 @@ function build() { popd local lib_prefix=$INSTALL_DIR/lib/lib$1 - if [ -f "$lib_prefix$LD_POSTFIX" ]; then - rm -rf -- "$lib_prefix$LD_POSTFIX_UNVERSIONED" - fi if [[ $(uname) == 'Darwin' ]]; then pushd "$INSTALL_DIR/lib" @@ -224,9 +226,9 @@ function build_caffe2() { ${CMAKE_VERSION} .. \ ${CMAKE_GENERATOR} \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DBUILD_CAFFE2=OFF \ + -DBUILD_CAFFE2=$FULL_CAFFE2 \ -DBUILD_ATEN=ON \ - -DBUILD_PYTHON=OFF \ + -DBUILD_PYTHON=$FULL_CAFFE2 \ -DBUILD_BINARY=OFF \ -DBUILD_SHARED_LIBS=ON \ -DUSE_CUDA=$WITH_CUDA \ @@ -249,6 +251,17 @@ function build_caffe2() { # to CMakeLists.txt and aten/CMakeLists.txt, not here. # We need the vanilla cmake build to work. ${CMAKE_INSTALL} -j"$NUM_JOBS" + + # Install Python proto files + if [[ $FULL_CAFFE2 -ne 0 ]]; then + find . 
-name proto + for proto_file in ./caffe/proto/*.py; do + cp $proto_file "$BASE_DIR/caffe/proto/" + done + for proto_file in ./caffe2/proto/*.py; do + cp $proto_file "$BASE_DIR/caffe2/proto/" + done + fi popd } diff --git a/tools/cpp_build/libtorch/CMakeLists.txt b/tools/cpp_build/libtorch/CMakeLists.txt index 4feb0e52e5293d..3e0a012a620a66 100644 --- a/tools/cpp_build/libtorch/CMakeLists.txt +++ b/tools/cpp_build/libtorch/CMakeLists.txt @@ -231,6 +231,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/decompose_addmm.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp ${TORCH_SRC_DIR}/csrc/jit/interned_strings.cpp ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/script/lexer.cpp diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 7c3133aa3fd986..27fb9a9aa5c83a 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -77,7 +77,7 @@ def is_magic_method(api_name): return api_name.startswith('__') and api_name.endswith('__') -blacklisted_types = {'SparseTensor', 'Storage', 'ScalarType', 'optional'} +blacklisted_types = {'SparseTensorRef', 'Storage', 'ScalarType', 'optional', 'std::string'} default_only_types = {'Generator'} @@ -97,9 +97,8 @@ def is_jit_op(decl): # we currently only support vararg tensor lists when they are the _first_ argument # and the only tensor argument arguments = decl['arguments'] - has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) - num_tensor_args = sum(map(is_tensor_arg, arguments)) - if has_tensorlist and (num_tensor_args != 1 or arguments[0]['simple_type'] != 'TensorList'): + # Only support a single TensorList arg + if sum(arg['simple_type'] == 'TensorList' for arg in arguments) > 1: return False return ((not decl['api_name'].endswith('_') or is_magic_method(decl['api_name'])) and @@ -157,25 +156,54 @@ def emit_decl_variant(decl, is_positional_arg, has_tensorlist): # from the end of the stack static_inputs = sum(is_positional_arg) - 1 num_dynamic_inputs = 'varargs_length' + tensorlist_idx = [i for i, arg in enumerate(decl['arguments']) if arg['simple_type'] == 'TensorList'][0] else: static_inputs = sum(is_positional_arg) num_dynamic_inputs = static_inputs - real_inputs = count() + real_inputs = 0 for i, arg in enumerate(decl['arguments']): - # XXX: we currently support only TensorList ops that have a TensorList as - # the first argument, that is then followed by a number of positional args. + # This conditional allows us to process argument lists with a flattened argument list + # with a single TensorList. Given the sequence of arguments: + # a b c [d e f g] h i # [] is the list + # + # 1. For the section where we are processing positional inputs before the + # TensorList: + # a b c [d e f g] h i # [] is the list + # ~~~~~~~~~~~~ <- N + # we set this view_length to the total number of varargs inputs (i.e. the length) + # of the whole argument list. This means that indexing into the list using peek() + # we will retrieve arguments ar their true indices (i.e. peek at 0 points to a, + # 1 points to b, etc...). Similarly, we can use peekSlice() to index into the + # list itself this way. + # 2. After the list: + # a b c [d e f g] h i # [] is the list + # ~~~~~~ <- N + # Here we set the view length to static_inputs. In our example, + # we effectively ignore the fact that we have a list here. 
What is + # significant is that our index i is equivalent when the view length + # is right-justified, whether we have the list or not. Concretely, + # indexing h or i from `a b c [d e f g] h i` is equvalent to indexing + # h or i from `a b c h i`. + view_length = 'varargs_length' if has_tensorlist and i < tensorlist_idx else static_inputs + if arg['simple_type'] == 'TensorList': - arguments.append('peekSlice(stack, 0, varargs_length - {}, varargs_length)'.format(static_inputs)) + # NOTE: don't advance real_inputs here. After this we are going + # to switch over to indexing from the end as if we only had + # the static arguments. + arguments.append('peekSlice(stack, {}, varargs_length - {}, varargs_length)' + .format(real_inputs, static_inputs)) elif arg['simple_type'] in default_only_types: arguments.append(arg['default']) elif is_tensor_arg(arg): - arguments.append('std::move(peek(stack, {}, {}))'.format(next(real_inputs), static_inputs)) + arguments.append('std::move(peek(stack, {}, {}))'.format(real_inputs, view_length)) + real_inputs += 1 elif is_positional_arg[i]: template_kwargs = dict(type=arg['simple_type'], name=arg['name'], - i=next(real_inputs), - N=static_inputs) + i=real_inputs, + N=view_length) + real_inputs += 1 if is_sized_intlist_arg(arg): assign = POS_INTLIST_ASSIGNMENT.substitute(size=arg['size'], @@ -358,12 +386,13 @@ def emit_arg(arg, is_return): def emit(decl): arguments = [a for a in decl['arguments'] if a['simple_type'] not in default_only_types] n = get_name(decl['name']) + n_args = len(arguments) + n_returns = len(decl['returns']) + env['arguments'].append('// Arguments for {} ({} args, {} returns)'.format(decl['name'], n_args, n_returns)) for a in arguments: emit_arg(a, False) for a in decl['returns']: emit_arg(a, True) - n_args = len(arguments) - n_returns = len(decl['returns']) env['operators'].append('{{ {}, {}, {} }}, // FunctionSchema("{}", <{} arguments>, <{} returns>) '.format( n, n_args, n_returns, decl['name'], n_args, n_returns)) diff --git a/tools/jit/templates/aten_dispatch.cpp b/tools/jit/templates/aten_dispatch.cpp index f6d6f87bb16c14..d1545d52898890 100644 --- a/tools/jit/templates/aten_dispatch.cpp +++ b/tools/jit/templates/aten_dispatch.cpp @@ -79,6 +79,8 @@ int deviceForInputs(Stack & stack, size_t N) { std::unordered_set tensor_vararg_fns = { aten::cat, aten::stack, + aten::index, + aten::index_put, }; template diff --git a/tools/setup_helpers/miopen.py b/tools/setup_helpers/miopen.py new file mode 100644 index 00000000000000..a87996688e7172 --- /dev/null +++ b/tools/setup_helpers/miopen.py @@ -0,0 +1,16 @@ +import os +import glob + +from .env import IS_WINDOWS, IS_CONDA, CONDA_DIR, check_env_flag, gather_paths +from .rocm import WITH_ROCM, ROCM_HOME + + +WITH_MIOPEN = False +MIOPEN_LIB_DIR = None +MIOPEN_INCLUDE_DIR = None +MIOPEN_LIBRARY = None +if WITH_ROCM and not check_env_flag('NO_MIOPEN'): + WITH_MIOPEN = True + MIOPEN_LIB_DIR = ROCM_HOME + "/miopen/lib" + MIOPEN_INCLUDE_DIR = ROCM_HOME + "/miopen/include/miopen" + MIOPEN_LIBRARY = "MIOpen" diff --git a/tools/setup_helpers/rocm.py b/tools/setup_helpers/rocm.py new file mode 100644 index 00000000000000..b981dd82dc5020 --- /dev/null +++ b/tools/setup_helpers/rocm.py @@ -0,0 +1,5 @@ +from .env import check_env_flag +# Check if ROCM is enabled +WITH_ROCM = check_env_flag('WITH_ROCM') +ROCM_HOME = "/opt/rocm" +ROCM_VERSION = "" diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index d6915e4ac6f520..6fdc1f5a24711d 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -196,6 
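[Editor's note] A rough Python model of the view_length indexing described in the gen_jit_dispatch.py comment above. The peek/peek_slice helpers here are stand-ins that assume "element i within a right-justified window of the last N stack entries"; the real helpers are the C++ stack utilities used by the generated dispatch code.

```python
def peek(stack, i, N):
    # Assumed semantics: element i of the last N stack entries.
    return stack[len(stack) - N + i]

def peek_slice(stack, i, length, N):
    start = len(stack) - N + i
    return stack[start:start + length]

# a b c [d e f g] h i   ([] is the single TensorList)
stack = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
varargs_length = len(stack)  # 9 dynamic inputs
static_inputs = 5            # a, b, c, h, i

# 1. Before the list the view length is varargs_length, so indices are "true":
assert peek(stack, 0, varargs_length) == "a"
assert peek(stack, 2, varargs_length) == "c"

# The TensorList itself is the slice of the remaining dynamic inputs:
assert peek_slice(stack, 3, varargs_length - static_inputs,
                  varargs_length) == ["d", "e", "f", "g"]

# 2. After the list the view length is static_inputs; right-justified indexing
#    makes "a b c [d e f g] h i" and "a b c h i" equivalent for h and i:
assert peek(stack, 3, static_inputs) == "h"
assert peek(["a", "b", "c", "h", "i"], 3, static_inputs) == "h"
```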
+196,14 @@ def _tensor_str(self, indent, fmt, scale, sz, summarize): return '[' + tensor_str + ']' +def _maybe_wrap_suffix(suffix, indent, tensor_str): + suffix_len = len(suffix) + last_line_len = len(tensor_str) - tensor_str.rfind('\n') + 1 + if suffix_len > 2 and last_line_len + suffix_len > PRINT_OPTS.linewidth: + return ',\n' + ' ' * indent + suffix[2:] + return suffix + + def get_summarized_data(self): dim = self.dim() if dim == 0: @@ -224,27 +232,36 @@ def _str(self): indent = len(prefix) summarize = self.numel() > PRINT_OPTS.threshold - suffix = ')' + suffix = '' if not torch._C._is_default_type_cuda(): if self.device.type == 'cuda': - suffix = ', device=\'' + str(self.device) + '\'' + suffix + suffix += ', device=\'' + str(self.device) + '\'' else: if self.device.type == 'cpu' or torch.cuda.current_device() != self.device.index: - suffix = ', device=\'' + str(self.device) + '\'' + suffix + suffix += ', device=\'' + str(self.device) + '\'' if self.numel() == 0: # In an empty tensor, there are no elements to infer if the dtype should be int64, # so it must be shown explicitly. if self.dtype != torch.get_default_dtype(): - suffix = ', dtype=' + str(self.dtype) + suffix + suffix += ', dtype=' + str(self.dtype) tensor_str = '[]' else: if self.dtype != torch.get_default_dtype() and self.dtype != torch.int64: - suffix = ', dtype=' + str(self.dtype) + suffix + suffix += ', dtype=' + str(self.dtype) fmt, scale, sz = _number_format(get_summarized_data(self) if summarize else self) if scale != 1: prefix = prefix + SCALE_FORMAT.format(scale) + ' ' * indent tensor_str = _tensor_str(self, indent, fmt, scale, sz, summarize) + if self.grad_fn is not None: + suffix += ', grad_fn=<{}>'.format(type(self.grad_fn).__name__) + elif self.requires_grad: + suffix += ', requires_grad=True' + + suffix += ')' + + suffix = _maybe_wrap_suffix(suffix, indent, tensor_str) + return prefix + tensor_str + suffix diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index bc96fa18ae41c1..79bf688fe01042 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -9,7 +9,7 @@ from .variable import Variable from .function import Function, NestedIOFunction -from .gradcheck import gradcheck +from .gradcheck import gradcheck, gradgradcheck from .grad_mode import no_grad, enable_grad, set_grad_enabled from . import profiler diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 972f2656364a2a..648eb16d45058f 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -21,7 +21,7 @@ def make_jacobian(input, num_out): return None if not input.requires_grad: return None - return torch.zeros(input.nelement(), num_out) + return torch.zeros(input.nelement(), num_out, dtype=input.dtype) elif isinstance(input, Iterable): jacobians = list(filter( lambda x: x is not None, (make_jacobian(elem, num_out) for elem in input))) @@ -125,25 +125,36 @@ def _differentiable_outputs(x): def gradcheck(func, inputs, eps=1e-6, atol=1e-5, rtol=1e-3, raise_exception=True): - """Check gradients computed via small finite differences - against analytical gradients + r"""Check gradients computed via small finite differences against analytical + gradients - The check between numerical and analytical has the same behaviour as - numpy.allclose https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html - meaning it check that - absolute(a - n) <= (atol + rtol * absolute(n)) - is true for all elements of analytical jacobian a and numerical jacobian n. 
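[Editor's note] The practical effect of the _tensor_str.py changes above is that printed tensors now carry device/dtype/grad_fn/requires_grad suffixes, wrapped onto a new line when they would exceed the line width. A minimal sketch of what this looks like; exact element formatting and grad_fn class names vary by build, so the printed strings in the comments are only indicative.

```python
import torch

x = torch.ones(2, requires_grad=True)
y = x * 2

print(x)  # e.g. tensor([ 1.,  1.], requires_grad=True)
print(y)  # e.g. tensor([ 2.,  2.], grad_fn=<MulBackward>)  (grad_fn name varies)

# Empty tensors always show a non-default dtype explicitly, per the change above:
print(torch.zeros(0, dtype=torch.int32))  # e.g. tensor([], dtype=torch.int32)
```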
+ The check between numerical and analytical gradients has the same behaviour as + `numpy.allclose `_, + i.e., it checks that + + .. math:: + + \lvert a - n \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert n \rvert + + holds for all elements of analytical gradient :math:`a` and numerical + gradient :math:`n`. + + .. note:: + The default values are designed for :attr:`input` of double precision. + This check will likely fail if :attr:`input` is of single precision, + i.e., ``FloatTensor``. Args: - func: Python function that takes Tensor inputs and returns + func (function): a Python function that takes Tensor inputs and returns a Tensor or a tuple of Tensors - inputs: tuple of Tensors - eps: perturbation for finite differences - atol: absolute tolerance - rtol: relative tolerance - raise_exception: bool indicating whether to raise an exception if - gradcheck fails. The exception gives more information about the + inputs (tuple of Tensor): inputs to the function + eps (float, optional): perturbation for finite differences + atol (float, optional): absolute tolerance + rtol (float, optional): relative tolerance + raise_exception (bool, optional): indicating whether to raise an exception if + the check fails. The exception gives more information about the exact nature of the failure. This is helpful when debugging gradchecks. + Returns: True if all differences satisfy allclose condition """ @@ -208,19 +219,30 @@ def fn(input): def gradgradcheck(func, inputs, grad_outputs=None, eps=1e-6, atol=1e-5, rtol=1e-3, gen_non_contig_grad_outputs=False, raise_exception=True): - """Check gradients of gradients computed via small finite differences - against analytical gradients + r"""Check gradients of gradients computed via small finite differences + against analytical gradients + This function checks that backpropagating through the gradients computed - to the given grad_outputs are correct. + to the given :attr:`grad_outputs` are correct. + + The check between numerical and analytical gradients has the same behaviour as + `numpy.allclose `_, + i.e., it checks that + + .. math:: - The check between numerical and analytical has the same behaviour as - numpy.allclose https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html - meaning it check that - absolute(a - n) <= (atol + rtol * absolute(n)) - is true for all elements of analytical gradient a and numerical gradient n. + \lvert a - n \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert n \rvert + + holds for all elements of analytical gradient :math:`a` and numerical + gradient :math:`n`. + + .. note:: + The default values are designed for :attr:`input` of double precision. + This check will likely fail if :attr:`input` is of single precision, + i.e., ``FloatTensor``. Args: - func (function): Python function that takes Tensor inputs and returns + func (function): a Python function that takes Tensor inputs and returns a Tensor or a tuple of Tensors inputs (tuple of Tensor): inputs to the function grad_outputs (tuple of Tensor, optional): The gradients with respect to @@ -231,13 +253,12 @@ def gradgradcheck(func, inputs, grad_outputs=None, eps=1e-6, atol=1e-5, rtol=1e- gen_non_contig_grad_outputs (bool, optional): if :attr:`grad_outputs` is ``None`` and :attr:`gen_non_contig_grad_outputs` is ``True``, the randomly generated gradient outputs are made to be noncontiguous - raise_exception: bool indicating whether to raise an exception if - gradcheck fails. 
The exception gives more information about the + raise_exception (bool, optional): indicating whether to raise an exception if + the check fails. The exception gives more information about the exact nature of the failure. This is helpful when debugging gradchecks. Returns: - True if all differences satisfy allclose condition. Raises an exception - otherwise. + True if all differences satisfy allclose condition """ if grad_outputs is None: # If grad_outputs is not specified, create random Tensors of the same diff --git a/torch/csrc/THP.h b/torch/csrc/THP.h index 68451ea5c95744..6cb660e6a89149 100644 --- a/torch/csrc/THP.h +++ b/torch/csrc/THP.h @@ -24,6 +24,11 @@ #define LIBRARY_STATE_TYPE #define LIBRARY_STATE_TYPE_NOARGS +#define THWStorage THStorage +#define THWStorage_(NAME) THStorage_(NAME) +#define THWTensor THTensor +#define THWTensor_(NAME) THTensor_(NAME) + #include "PtrWrapper.h" #include "Exceptions.h" #include "Generator.h" diff --git a/torch/csrc/api/include/torch/detail/ordered_dict.h b/torch/csrc/api/include/torch/detail/ordered_dict.h new file mode 100644 index 00000000000000..c165ca5d5def82 --- /dev/null +++ b/torch/csrc/api/include/torch/detail/ordered_dict.h @@ -0,0 +1,220 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace detail { + +/// A simple ordered dictionary implementation, akin to Python's `OrderedDict`. +template +class OrderedDict { + public: + struct Item { + Item(Key key_, Value value_) + : key(std::move(key_)), value(std::move(value_)) {} + + Value& operator*() { + return value; + } + const Value& operator*() const { + return value; + } + Value* operator->() { + return &value; + } + const Value* operator->() const { + return &value; + } + + const Key key; + Value value; + }; + + // The lifetime of an iterator is bound to the lifetime of the `OrderedDict`. + // Further, any `insert()` operation may invalidate all iterators + // pointing into the vector. + + using Iterator = typename std::vector::iterator; + using ConstIterator = typename std::vector::const_iterator; + + /// Constructs the `OrderedDict` with a string that should describe the kind + /// of value stored in this `OrderedDict`, for example 'parameter' or + /// 'module'. + explicit OrderedDict(std::string subject = "Key") + : subject_(std::move(subject)) {} + + // Copy we have to do ourselves, because items' keys are const, so we have to + // re-insert the items. + OrderedDict(const OrderedDict& other) + : index_(other.index_), subject_(other.subject_) { + for (const auto& item : other.items_) { + items_.push_back(item); + } + } + + OrderedDict& operator=(const OrderedDict& other) { + index_ = other.index_; + items_.clear(); + for (auto& item : other.items_) { + items_.push_back(item); + } + subject_ = other.subject_; + return *this; + } + + // Move works by default, because you can move-construct vectors of const + // values.. + OrderedDict(OrderedDict&& other) = default; + OrderedDict& operator=(OrderedDict&& other) = default; + + ~OrderedDict() = default; + + /*implicit */ OrderedDict(std::initializer_list initializer_list) + : OrderedDict("Key") { + items_.reserve(initializer_list.size()); + for (auto& item : initializer_list) { + // Copy the key here and move it into the index. 
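[Editor's note] A minimal usage example for the gradcheck/gradgradcheck docstrings above. It relies only on what the patch states: gradgradcheck is now re-exported from torch.autograd, the comparison is the allclose-style test |a - n| <= atol + rtol * |n|, and the default tolerances are tuned for double precision, so FloatTensor inputs will likely fail.

```python
import torch
from torch.autograd import gradcheck, gradgradcheck

# Double-precision inputs, as recommended by the note in the docstring.
inputs = (torch.randn(4, 3, dtype=torch.float64, requires_grad=True),)

assert gradcheck(torch.sigmoid, inputs, eps=1e-6, atol=1e-5, rtol=1e-3)
assert gradgradcheck(torch.sigmoid, inputs)
```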
+ items_.emplace_back(item.key, std::move(item.value)); + index_.emplace(std::move(item.key), size() - 1); + } + } + + Iterator begin() { + return items_.begin(); + } + + ConstIterator begin() const { + return items_.begin(); + } + + Iterator end() { + return items_.end(); + } + + ConstIterator end() const { + return items_.end(); + } + + Item& front() { + return items_.front(); + } + + const Item& front() const { + return items_.front(); + } + + Item& back() { + return items_.back(); + } + + const Item& back() const { + return items_.back(); + } + + Item& operator[](size_t index) { + return items_[index]; + } + + const Item& operator[](size_t index) const { + return items_[index]; + } + + Value& operator[](const Key& key) { + return get(key); + } + + const Value& operator[](const Key& key) const { + return get(key); + } + + template + Value& insert(K&& key, V&& value) { + AT_CHECK(index_.count(key) == 0, subject_, " '", key, "' already defined"); + // Copy `key` here and move it into the index. + items_.emplace_back(key, std::forward(value)); + index_.emplace(std::forward(key), size() - 1); + return items_.back().value; + } + + /// Allows calling `insert` with an initializer list for the value, e.g. + /// `insert(key, {...})`. + Value& insert(Key key, Value&& value) { + return insert(std::move(key), std::move(value)); + } + + void update(OrderedDict&& other) { + for (auto& item : other) { + // We want to call `insert()` to prevent duplicate keys. + insert(std::move(item.key), std::move(item.value)); + } + } + + void update(const OrderedDict& other) { + for (auto& item : other) { + // We want to call `insert()` to prevent duplicate keys. + insert(item.key, item.value); + } + } + + Value* find(const Key& key) noexcept { + auto iterator = index_.find(key); + if (iterator == index_.end()) { + return nullptr; + } + return &items_[iterator->second].value; + } + + const Value* find(const Key& key) const noexcept { + auto iterator = index_.find(key); + if (iterator == index_.end()) { + return nullptr; + } + return &items_[iterator->second].value; + } + + Value& get(const Key& key) { + if (auto* value = find(key)) { + return *value; + } + AT_ERROR(subject_, " '", key, "' is not defined"); + } + + const Value& get(const Key& key) const { + if (auto* value = find(key)) { + return *value; + } + AT_ERROR(subject_, " '", key, "' is not defined"); + } + + void clear() { + index_.clear(); + items_.clear(); + } + + size_t size() const noexcept { + return items_.size(); + } + + bool is_empty() const noexcept { + return items_.empty(); + } + + const std::string& subject() const noexcept { + return subject_; + } + + private: + std::unordered_map index_; + std::vector items_; + std::string subject_; +}; +} // namespace detail +} // namespace torch diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index 7f998efcd032b0..63cb4aa46a2e1c 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -93,26 +94,29 @@ class Module { } protected: - Variable register_parameter(const std::string& name, at::Tensor tensor); - Variable register_buffer(const std::string& name, at::Tensor tensor); + autograd::Variable& register_parameter(std::string name, at::Tensor tensor); + autograd::Variable& register_buffer(std::string name, at::Tensor tensor); template std::shared_ptr register_module( - const std::string& name, - const std::shared_ptr& module) { - const auto pair = 
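[Editor's note] The new torch::detail::OrderedDict above describes itself as akin to Python's OrderedDict, but with two differences worth calling out: insert() rejects duplicate keys, and get() raises an error built from the "subject" string. The Python sketch below is only a rough model of those semantics, not the real class.

```python
# Rough Python model of the semantics of torch::detail::OrderedDict above.
class OrderedDictModel:
    def __init__(self, subject="Key"):
        self.subject = subject
        self.items = []   # (key, value) pairs, in insertion order
        self.index = {}   # key -> position in self.items

    def insert(self, key, value):
        if key in self.index:
            raise KeyError("{} '{}' already defined".format(self.subject, key))
        self.index[key] = len(self.items)
        self.items.append((key, value))
        return value

    def find(self, key):
        pos = self.index.get(key)
        return None if pos is None else self.items[pos][1]

    def get(self, key):
        value = self.find(key)
        if value is None:
            raise KeyError("{} '{}' is not defined".format(self.subject, key))
        return value

params = OrderedDictModel("Parameter")
params.insert("weight", [1.0, 2.0])
params.insert("bias", [0.0])
# params.insert("weight", ...)  -> raises: Parameter 'weight' already defined
```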
children_.emplace(name, module); - AT_CHECK(pair.second, "Module has already been registered"); - return module; + std::string name, + std::shared_ptr module) { + auto& base_module = children_.insert(std::move(name), std::move(module)); + return std::static_pointer_cast(base_module); } private: + template + using OrderedDict = torch::detail::OrderedDict; + template friend class CloneableModule; virtual void clone_(Module& other); - std::unordered_map parameters_; - std::unordered_map> children_; + OrderedDict parameters_; + OrderedDict buffers_; + OrderedDict> children_; /// The module's name (e.g. "LSTM"). mutable at::optional name_; @@ -151,13 +155,11 @@ class CloneableModule : public Module { copy->parameters_.clear(); copy->children_.clear(); copy->reset(); - for (auto& parameter : parameters_) { - copy->parameters_.at(parameter.first) - .data() - .copy_(parameter.second.data()); + for (const auto& parameter : parameters_) { + copy->parameters_[parameter.key].data().copy_(parameter->data()); } - for (auto& child : children_) { - copy->children_.at(child.first)->clone_(*child.second); + for (const auto& child : children_) { + copy->children_[child.key]->clone_(*child.value); } return copy; } diff --git a/torch/csrc/api/src/nn/module.cpp b/torch/csrc/api/src/nn/module.cpp index dac4093b731d80..3f9ca4a46630b5 100644 --- a/torch/csrc/api/src/nn/module.cpp +++ b/torch/csrc/api/src/nn/module.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -45,15 +44,13 @@ std::shared_ptr Module::clone() const { std::map Module::parameters() const { std::map ret; - for (const auto& pair : children_) { - auto& name = pair.first; - auto& child = pair.second; - for (auto& p : child->parameters()) { - ret[name + "." + p.first] = p.second; + for (const auto& child : children_) { + for (auto& p : child.value->parameters()) { + ret[child.key + "." 
+ p.first] = p.second; } } - for (const auto& pair : parameters_) { - ret[pair.first] = pair.second; + for (const auto& parameter : parameters_) { + ret[parameter.key] = parameter.value; } return ret; } @@ -67,34 +64,33 @@ Variable& Module::param(std::string const& name) { break; } - auto child_name = name.substr(begin, dot_pos - begin); - auto it = container->children_.find(child_name); - if (it == container->children_.end()) { - throw std::runtime_error("No such child: " + child_name); + const auto child_name = name.substr(begin, dot_pos - begin); + if (auto* child = container->children_.find(child_name)) { + container = child->get(); + } else { + AT_ERROR("No such child: ", child_name); } - container = it->second.get(); begin = dot_pos + 1; // Skip the dot } - auto param_name = name.substr(begin); - auto it = container->parameters_.find(param_name); - if (it == container->parameters_.end()) { - throw std::runtime_error("No such param: " + param_name); + const auto parameter_name = name.substr(begin); + if (auto* parameter = container->parameters_.find(parameter_name)) { + return *parameter; } - return it->second; + AT_ERROR("No such param: ", parameter_name); } void Module::train() { - for (auto& pair : children_) { - pair.second->train(); + for (auto& child : children_) { + child.value->train(); } is_training_ = true; } void Module::eval() { - for (auto& pair : children_) { - pair.second->eval(); + for (auto& child : children_) { + child.value->eval(); } is_training_ = false; } @@ -109,39 +105,36 @@ void Module::cpu() { void Module::to(at::Type& type) { for (auto& child : children_) { - child.second->to(type); + child.value->to(type); } - for (auto& pair : parameters_) { - auto parameter = pair.second; - at::detail::set_data(parameter, parameter.data().toType(type)); - AT_ASSERT(parameter.data().type() == type); - AT_ASSERT(¶meter.type() == autograd::VariableType::getType(type)); + for (auto& parameter : parameters_) { + at::detail::set_data(*parameter, parameter->data().toType(type)); + AT_ASSERT(parameter->data().type() == type); + AT_ASSERT(¶meter->type() == autograd::VariableType::getType(type)); } } void Module::to(at::ScalarType scalar_type) { for (auto& child : children_) { - child.second->to(scalar_type); + child.value->to(scalar_type); } - for (auto& pair : parameters_) { - auto parameter = pair.second; - auto& new_type = parameter.data().type().toScalarType(scalar_type); - at::detail::set_data(parameter, parameter.data().toType(new_type)); - AT_ASSERT(parameter.data().type().scalarType() == scalar_type); - AT_ASSERT(parameter.type().scalarType() == scalar_type); + for (auto& parameter : parameters_) { + auto& new_type = parameter->data().type().toScalarType(scalar_type); + at::detail::set_data(*parameter, parameter->data().toType(new_type)); + AT_ASSERT(parameter->data().type().scalarType() == scalar_type); + AT_ASSERT(parameter->type().scalarType() == scalar_type); } } void Module::to(at::Backend backend) { for (auto& child : children_) { - child.second->to(backend); + child.value->to(backend); } - for (auto& pair : parameters_) { - auto parameter = pair.second; - auto& new_type = parameter.data().type().toBackend(backend); - at::detail::set_data(parameter, parameter.data().toType(new_type)); - AT_ASSERT(parameter.data().type().backend() == backend); - AT_ASSERT(parameter.type().backend() == backend); + for (auto& parameter : parameters_) { + auto& new_type = parameter->data().type().toBackend(backend); + at::detail::set_data(*parameter, parameter->data().toType(new_type)); 
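[Editor's note] The reworked Module::parameters() above composes names as child.key + "." + p.first, i.e. the same dotted "<child>.<param>" convention the Python API already uses. A short Python illustration of that naming, shown here only as a point of comparison for the C++ behavior:

```python
import torch.nn as nn

model = nn.Sequential(nn.Linear(3, 4), nn.ReLU(), nn.Linear(4, 1))
print([name for name, _ in model.named_parameters()])
# ['0.weight', '0.bias', '2.weight', '2.bias']
```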
+ AT_ASSERT(parameter->data().type().backend() == backend); + AT_ASSERT(parameter->type().backend() == backend); } } @@ -151,27 +144,25 @@ bool Module::is_training() const noexcept { void Module::zero_grad() { for (auto& child : children_) { - child.second->zero_grad(); + child.value->zero_grad(); } - for (auto& pair : parameters_) { - pair.second.grad().zero_(); + for (auto& parameter : parameters_) { + parameter->grad().zero_(); } } -Variable Module::register_parameter( - const std::string& name, +autograd::Variable& Module::register_parameter( + std::string name, at::Tensor tensor) { auto variable = autograd::make_variable(tensor, /*requires_grad=*/true); - const auto pair = parameters_.emplace(name, std::move(variable)); - AT_CHECK(pair.second, "Parameter has already been registered"); - return pair.first->second; + return parameters_.insert(std::move(name), std::move(variable)); } -Variable Module::register_buffer(const std::string& name, at::Tensor tensor) { +autograd::Variable& Module::register_buffer( + std::string name, + at::Tensor tensor) { auto variable = autograd::make_variable(tensor, /*requires_grad=*/false); - const auto pair = parameters_.emplace(name, std::move(variable)); - AT_CHECK(pair.second, "Parameter has already been registered"); - return pair.first->second; + return parameters_.insert(std::move(name), std::move(variable)); } void Module::clone_(Module& other) {} diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 8f4d72398aca32..88aec1573e1e3b 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -288,6 +288,49 @@ static variable_list call_post_hooks(Function& fn, variable_list outputs, variab return outputs; } +static bool is_compatible_type(const at::Type& expected, const at::Type& actual) { + // Types are compatible if they exactly match or if the gradient is a sparse + // version of the expected type. + return expected == actual || (actual.is_sparse() && + expected == actual.toBackend(toDense(actual.backend()))); +} + +template +static void validate_outputs(const edge_list& edges, const variable_list& grads, const F& format_error) { + if (grads.size() != edges.size()) { + std::stringstream ss; + ss << "invalid number of gradients - expected "; + ss << edges.size() << ", but got " << grads.size(); + throw std::runtime_error(format_error(ss.str())); + } + for (size_t i = 0; i < grads.size(); i++) { + const auto& edge = edges[i]; + if (!edge.is_valid()) continue; + + const auto& metadata = edge.function->input_metadata(edge.input_nr); + const auto& output = grads[i]; + if (!output.defined()) { + // FIXME: TestJit.test_ge_optimized fails this assertion. 
+ // std::stringstream ss; + // ss << "undefined gradient at index " << i; + // throw std::runtime_error(format_error(ss.str())); + continue; + } + if (!grads[i].sizes().equals(metadata.shape())) { + std::stringstream ss; + ss << "invalid gradient at index " << i << " - expected shape "; + ss << metadata.shape() << " but got " << grads[i].sizes(); + throw std::runtime_error(format_error(ss.str())); + } + if (!is_compatible_type(metadata.type(), grads[i].type())) { + std::stringstream ss; + ss << "invalid gradient at index " << i << " - expected type "; + ss << metadata.type() << " but got " << grads[i].type(); + throw std::runtime_error(format_error(ss.str())); + } + } +} + static variable_list call_function(FunctionTask& task) { bool prev_checkpoint_valid_state = checkpoint_valid; checkpoint_valid = task.base->can_checkpoint() && prev_checkpoint_valid_state; @@ -298,6 +341,11 @@ static variable_list call_function(FunctionTask& task) { fn.will_release_variables(); } auto outputs = fn(inputs); + validate_outputs(fn.next_edges(), outputs, [&](const std::string& msg) { + std::ostringstream ss; + ss << "Function " << fn.name() << " returned an " << msg; + return ss.str(); + }); checkpoint_valid = prev_checkpoint_valid_state; return call_post_hooks(fn, std::move(outputs), std::move(inputs)); } @@ -323,13 +371,6 @@ auto Engine::evaluate_function(FunctionTask& task) -> void { fn.release_variables(); } - if (outputs.size() != fn.num_outputs()) { - std::stringstream ss; - ss << "Function '" << fn.name() << "' returned an invalid number of outputs - expected "; - ss << fn.num_outputs() << ", but got " << outputs.size(); - throw std::runtime_error(ss.str()); - } - int num_outputs = outputs.size(); if (num_outputs == 0) return; // Don't even acquire the mutex std::lock_guard lock(task.base->mutex); @@ -426,6 +467,11 @@ auto Engine::execute(const edge_list& input_roots, bool create_graph, const edge_list& outputs) -> variable_list { std::call_once(start_threads_flag, &Engine::start_threads, this); + + validate_outputs(input_roots, inputs, [](const std::string& msg) { + return msg; + }); + // Callbacks are only valid for the duration of this run and should always be cleared ClearCallbacks _cb_guard(final_callbacks, post_callbacks_lock); diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index 47b4e0620e0d7c..d116069149fa1e 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -19,8 +19,8 @@ namespace torch { namespace autograd { thread_local uint64_t Function::next_sequence_nr_ = 0; -auto Function::name() -> std::string { - return std::string(typeid(*this).name()); +auto Function::name() const -> std::string { + return at::demangle(typeid(*this).name()); } // This function is analogous to make_trace which operates on PythonOp, but this diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 77e9b218469e7c..d76296a10a11d8 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -5,6 +5,7 @@ #include "torch/csrc/autograd/grad_mode.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/saved_variable.h" +#include "torch/csrc/autograd/type_and_shape.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/jit/tracer.h" #include "torch/csrc/utils/auto_unique_ptr.h" @@ -91,18 +92,15 @@ struct Function : std::enable_shared_from_this { /// in the backward() pass, with higher sequence numbers prioritized /// before lower sequence numbers. 
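[Editor's note] The new validate_outputs() above checks each gradient produced during backward against the type-and-shape metadata recorded for the corresponding input edge, and call_function() wraps the message with the function name. A sketch of how this surfaces in Python is below; the custom Function deliberately returns a wrongly shaped gradient, and the exact wording of the error is approximate.

```python
import torch
from torch.autograd import Function

class BadGrad(Function):
    @staticmethod
    def forward(ctx, x):
        return x * 2

    @staticmethod
    def backward(ctx, grad_output):
        # Wrong shape on purpose: validate_outputs() compares this against the
        # recorded input metadata (type + shape) of the next edge.
        return torch.ones(1)

x = torch.randn(3, requires_grad=True)
try:
    BadGrad.apply(x).sum().backward()
except RuntimeError as e:
    print(e)  # e.g. "Function BadGradBackward returned an invalid gradient at
              # index 0 - expected shape [3] but got [1]" (wording may vary)
```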
explicit Function( - uint32_t num_inputs, uint64_t sequence_nr, edge_list&& next_edges = edge_list()) : sequence_nr_(sequence_nr), - num_inputs_(num_inputs), next_edges_(std::move(next_edges)) {} explicit Function( - uint32_t num_inputs = 0, edge_list&& next_edges = edge_list()) - : Function(num_inputs, next_sequence_nr_++, std::move(next_edges)) {} - + : Function(next_sequence_nr_++, std::move(next_edges)) {} + /// Functions are neither copyable nor moveable. Function(const Function& other) = delete; Function(Function&& other) = delete; @@ -123,20 +121,37 @@ struct Function : std::enable_shared_from_this { // Graph Connectivity API //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // Inputs + // Inputs. NOTE: inputs of the grad_fn correspond to Tensor outputs of the + // forward function. + + // Marker for expected undefined input + struct undefined_input {}; - /// Increments the number of inputs of the function and returns the previous - /// value. - uint32_t bump_inputs() noexcept { - return num_inputs_++; + /// Adds the type and shape metadata for a new input. Returns the index of + /// of the new input. + uint32_t add_input_metadata(const at::Type& type, at::IntList shape) noexcept { + uint32_t input_nr = input_metadata_.size(); + input_metadata_.emplace_back(type, shape); + return input_nr; } - void set_num_inputs(uint32_t num_inputs) noexcept { - num_inputs_ = num_inputs; + /// Adds a placeholder for an input that will not be used. + uint32_t add_input_metadata(undefined_input u) noexcept { + uint32_t input_nr = input_metadata_.size(); + input_metadata_.emplace_back(); + return input_nr; } uint32_t num_inputs() const noexcept { - return num_inputs_; + return input_metadata_.size(); + } + + const TypeAndShape& input_metadata(size_t index) const { + return input_metadata_[index]; + } + + void clear_input_metadata() { + input_metadata_.clear(); } // Outputs ("Next Edges") @@ -185,7 +200,7 @@ struct Function : std::enable_shared_from_this { } /// Returns the name of the dynamic type of the function, for debugging. - virtual std::string name(); + virtual std::string name() const; /// Returns true if the particular output edge is active, and that particular /// output of this function should be computed. @@ -312,12 +327,12 @@ struct Function : std::enable_shared_from_this { // fields. const uint64_t sequence_nr_; - uint32_t num_inputs_; edge_list next_edges_; PyObject* pyobj_ = nullptr; // weak reference std::vector> pre_hooks_; std::vector> post_hooks_; auto_unique_ptr tracing_state_; + at::SmallVector input_metadata_; }; /// See Function::is_traceable() for definition. @@ -355,13 +370,14 @@ struct MakeNextFunctionList : IterArgs { /// `input_nr` thus equal to `function->num_inputs()`. Additionally, it /// increments the `Function`'s number of inputs by one. Approximately /// equivalent to `variable.set_gradient_edge(function, -/// function->bump_inputs())`. If you don't want the `Function`'s `num_inputs` -/// to be incremented, use `set_gradient_edge` directly. +/// function->add_input_metadata(variable.type(), variable.sizes()))`. +/// If you don't want the `Function`'s `num_inputs` to be incremented, use +/// `set_gradient_edge` directly. inline void create_gradient_edge( Variable& variable, std::shared_ptr function) { // Copy before move. 
- const auto input_nr = function->bump_inputs(); + const auto input_nr = function->add_input_metadata(variable.type(), variable.sizes()); variable.set_gradient_edge({std::move(function), input_nr}); } diff --git a/torch/csrc/autograd/functions/accumulate_grad.cpp b/torch/csrc/autograd/functions/accumulate_grad.cpp index 31fc2434ea5d90..fdbe54d3fc3e92 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.cpp +++ b/torch/csrc/autograd/functions/accumulate_grad.cpp @@ -13,12 +13,13 @@ using at::Tensor; namespace torch { namespace autograd { -// AccumulateGrad sets sequence_nr to the max value so it's always called +// AccumulateGrad sets sequence_nr to the max value so it's always called // ASAP during backwards. AccumulateGrad::AccumulateGrad(Variable variable_) - : Function(/*num_inputs=*/1 - , /*sequence_nr=*/UINT64_MAX) - , variable(std::move(variable_)) {} + : Function(/*sequence_nr=*/UINT64_MAX) + , variable(std::move(variable_)) { + add_input_metadata(variable.type(), variable.sizes()); +} auto AccumulateGrad::apply(const variable_list& grads) -> variable_list { // XXX: this method is not thread-safe! diff --git a/torch/csrc/autograd/functions/basic_ops.h b/torch/csrc/autograd/functions/basic_ops.h index a630c3c79eb378..7ac4d2a3faa3c8 100644 --- a/torch/csrc/autograd/functions/basic_ops.h +++ b/torch/csrc/autograd/functions/basic_ops.h @@ -12,7 +12,7 @@ namespace torch { namespace autograd { struct Error : public Function { Error(std::string msg, edge_list&& next_edges) - : Function(/*num_inputs=*/0, std::move(next_edges)) + : Function(std::move(next_edges)) , msg(std::move(msg)) {} Error(std::string msg) @@ -35,7 +35,7 @@ struct DelayedError : public Function { struct GraphRoot : public Function { GraphRoot(edge_list functions, variable_list inputs) - : Function(/*num_inputs=*/0, std::move(functions)), + : Function(std::move(functions)), outputs(std::move(inputs)) {} virtual variable_list apply(const variable_list& inputs) { diff --git a/torch/csrc/autograd/functions/special.cpp b/torch/csrc/autograd/functions/special.cpp index a5c5a6228e4098..88ac969122b780 100644 --- a/torch/csrc/autograd/functions/special.cpp +++ b/torch/csrc/autograd/functions/special.cpp @@ -17,7 +17,9 @@ namespace torch { namespace autograd { // Used when an output has multiple uses (there's only one entry // in next_edges per output). struct Replicate : public Function { - Replicate() : Function(/*num_inputs=*/1) {} + Replicate(const at::Type& type, at::IntList shape) : Function() { + add_input_metadata(type, shape); + } virtual variable_list apply(const variable_list& inputs) { TORCH_ASSERT(inputs.size() == 1); @@ -236,6 +238,7 @@ bool Eval::replaceSubgraph(const variable_list& inputs, const variable_list& _ou // This detaches the subgraph from the full backward graph. for (auto& begin : subgraph.boundary.begins) { const auto& edge = begin.function->next_edge(begin.input_nr); + begin.function->set_next_edge( begin.input_nr, Edge(ends_to_outputs.at(edge), 0)); } @@ -265,7 +268,7 @@ bool Eval::replaceSubgraph(const variable_list& inputs, const variable_list& _ou // the same Variable has been returned multiple times, and // is repeated in this list. 
if (output.grad_fn_unsafe() == this) { - auto replicate = std::make_shared(); + auto replicate = std::make_shared(output.type(), output.sizes()); replicate->add_next_edge({this_shared, output.output_nr()}); output.set_gradient_edge({std::move(replicate), 0}); repeated_outputs.emplace(&output); @@ -274,7 +277,8 @@ bool Eval::replaceSubgraph(const variable_list& inputs, const variable_list& _ou // perform any allocations until we actually see repeated outputs. if (repeated_outputs.count(&output) > 0) { auto & replicate = output.grad_fn(); - replicate->add_next_edge({this_shared, num_inputs_++}); + auto input_nr = add_input_metadata(output.type(), output.sizes()); + replicate->add_next_edge({this_shared, input_nr}); } else { autograd::create_gradient_edge(output, this_shared); } diff --git a/torch/csrc/autograd/functions/special.h b/torch/csrc/autograd/functions/special.h index 076a4faa98767a..273b139e238780 100644 --- a/torch/csrc/autograd/functions/special.h +++ b/torch/csrc/autograd/functions/special.h @@ -15,7 +15,9 @@ namespace torch { namespace autograd { struct EvalOutput : Function { explicit EvalOutput(const Edge& next_edge_) - : Function(/*num_inputs=*/1), next_edge(next_edge_) {} + : Function(), next_edge(next_edge_) { + add_input_metadata(undefined_input()); + } virtual variable_list apply(const variable_list& inputs) override { throw std::logic_error("EvalOutput::apply() called"); diff --git a/torch/csrc/autograd/functions/tensor.cpp b/torch/csrc/autograd/functions/tensor.cpp index 75ec8180c95983..5df72d52630680 100644 --- a/torch/csrc/autograd/functions/tensor.cpp +++ b/torch/csrc/autograd/functions/tensor.cpp @@ -36,12 +36,13 @@ CopySlices::CopySlices( const Variable& base_var, at::TensorGeometry view_, std::shared_ptr fn_) - : Function(/*num_inputs=*/1), + : Function(), base(base_var), view(std::move(view_)), fn(std::move(fn_)) { // Take the next_edges of fn as our own, except for index 0 which goes // to base instead of the view. 
+ add_input_metadata(base_var.type(), base_var.sizes()); const auto num_outputs = fn->num_outputs(); next_edges_.reserve(num_outputs); add_next_edge(base_var.gradient_edge()); diff --git a/torch/csrc/autograd/functions/utils.cpp b/torch/csrc/autograd/functions/utils.cpp index 09939a109a6c50..485572d9ee812c 100644 --- a/torch/csrc/autograd/functions/utils.cpp +++ b/torch/csrc/autograd/functions/utils.cpp @@ -29,7 +29,7 @@ variable_list wrap_outputs(const variable_list& inputs, tensor_list&& outputs, autograd::create_gradient_edge(variable, grad_fn); result.push_back(std::move(variable)); } else { - grad_fn->bump_inputs(); + grad_fn->add_input_metadata(Function::undefined_input()); result.emplace_back(); } } diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp index e07d88feeeb21d..e52e06a34199c5 100644 --- a/torch/csrc/autograd/python_engine.cpp +++ b/torch/csrc/autograd/python_engine.cpp @@ -134,7 +134,7 @@ PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwar } } - edge_list output_edges; + std::vector output_edges; if (inputs != nullptr) { int num_inputs = PyTuple_GET_SIZE(inputs); output_edges.reserve(num_inputs); diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 3fb53c9a6d804f..4672d535ef5d9a 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -207,7 +207,7 @@ auto PyFunction::release_variables() -> void { f->has_freed_buffers = 1; } -auto PyFunction::name() -> std::string { +auto PyFunction::name() const -> std::string { AutoGIL gil; auto f = (THPFunction*) obj; auto name = std::string(Py_TYPE(f)->tp_name); @@ -245,7 +245,7 @@ static int THPFunction_traverse(THPFunction *self, visitproc visit, void *arg) static int THPFunction_clear(THPFunction *self) { - self->cdata.set_num_inputs(0); + self->cdata.clear_input_metadata(); Py_CLEAR(self->needs_input_grad); @@ -293,7 +293,6 @@ PyObject *THPFunction_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) new (&self->input_info) std::vector(); new (&self->saved_variables) std::vector(); new (&self->is_variable_input) std::vector(); - self->cdata.set_num_inputs(0); return obj; } @@ -425,6 +424,10 @@ static void _wrap_outputs(THPFunction *self, // Note that output Variables may be repeated. In that case, the last call // to set_history wins. 
auto var = as_variable(obj, i); + if (cdata) { + auto output_nr = cdata->add_input_metadata(var.type(), var.sizes()); + TORCH_ASSERT(i == (int)output_nr); + } set_history(var, i, is_input, is_modified, is_differentiable); if (is_executable) { @@ -616,7 +619,7 @@ PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const Unpacked THPObjectPtr outputs(PyTuple_New(num_outputs)); if (!outputs) throw python_error(); - grad_fn->cdata.set_num_inputs(num_outputs); + grad_fn->cdata.clear_input_metadata(); // Record type, device, and size information about inputs if (is_executable) { diff --git a/torch/csrc/autograd/python_function.h b/torch/csrc/autograd/python_function.h index 562ab79a280f68..529bcaf4546a15 100644 --- a/torch/csrc/autograd/python_function.h +++ b/torch/csrc/autograd/python_function.h @@ -35,7 +35,7 @@ struct PyFunction : public Function { variable_list legacy_apply(const variable_list& inputs); virtual void release_variables() override; - virtual std::string name() override; + virtual std::string name() const override; virtual std::shared_ptr get_shared_ptr() override; virtual bool is_traceable() override; diff --git a/torch/csrc/autograd/python_legacy_variable.cpp b/torch/csrc/autograd/python_legacy_variable.cpp index 179c54a1e3477d..33dd281ab078d5 100644 --- a/torch/csrc/autograd/python_legacy_variable.cpp +++ b/torch/csrc/autograd/python_legacy_variable.cpp @@ -57,7 +57,7 @@ static PyObject *THPVariable_pynew(PyTypeObject* type, PyObject *args, PyObject Variable var; if (grad_fn) { auto grad_fn_ = THPFunction_asFunction((THPFunction*)grad_fn); - Edge edge(grad_fn_, grad_fn_->bump_inputs()); + Edge edge(grad_fn_, grad_fn_->add_input_metadata(tensor.type(), tensor.sizes())); var = make_variable(std::move(tensor), std::move(edge)); } else { var = make_variable(std::move(tensor), requires_grad); diff --git a/torch/csrc/autograd/type_and_shape.h b/torch/csrc/autograd/type_and_shape.h new file mode 100644 index 00000000000000..01a62fa2cf18a9 --- /dev/null +++ b/torch/csrc/autograd/type_and_shape.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include "torch/csrc/assertions.h" + +namespace torch { namespace autograd { + +/// A tensor's type and shape. Each Function records the required type and +/// shape of its inputs. If is_valid() is false, then the corresponding input +/// is not used and may be an undefined tensor. +struct TypeAndShape { + TypeAndShape() : type_(nullptr) {} + + TypeAndShape(const at::Type& type, at::IntList shape) + : type_(&type) , shape_(shape) {} + + bool is_valid() const { + return type_ != nullptr; + } + + const at::Type& type() const { + TORCH_ASSERT(type_); + return *type_; + } + + at::IntList shape() const { + return shape_; + } + + const at::Type* type_; + at::DimVector shape_; +}; + +}} diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index f0b4c1df9d99fb..48a35f4f88795f 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -126,10 +126,12 @@ void Variable::Impl::backward( } void Variable::Impl::set_data(Tensor new_data) { - data_ = std::move(new_data); - if (data_.type() != *type_) { - type_ = VariableType::getType(data_); + if (new_data.type() != data_.type()) { + type_ = VariableType::getType(new_data.type()); + // Clear grad_accumulator if it exists, since it stores the old type info. 
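The autograd changes above and below all follow the same new pattern: instead of passing /*num_inputs=*/N to the Function constructor (or calling bump_inputs/set_num_inputs later), a function now records the type and shape of each expected gradient slot via add_input_metadata(), which returns the slot index, and uses undefined_input() as a placeholder when that slot may stay undefined. A minimal sketch of the pattern, using a hypothetical ScaleBackward function that is not part of this patch:

struct ScaleBackward : public Function {
  explicit ScaleBackward(const Variable& input) : Function() {
    // Records a TypeAndShape entry for the single gradient slot; returns 0 here.
    auto input_nr = add_input_metadata(input.type(), input.sizes());
    (void)input_nr;
    // If the corresponding gradient may legitimately be undefined, record a
    // placeholder instead: add_input_metadata(undefined_input());
  }
  virtual variable_list apply(const variable_list& grads) override {
    return grads;  // placeholder body; only the metadata bookkeeping matters here
  }
};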
+ grad_accumulator_.reset(); } + data_ = std::move(new_data); } Variable::ViewImpl::ViewImpl(Variable base, at::Tensor data, Edge gradient_edge) @@ -158,7 +160,7 @@ std::shared_ptr& Variable::ViewImpl::get_grad_fn() { fn->stride = strides(); fn->storage_offset = data_.storage_offset(); fn->set_next_edges(collect_next_edges(base_)); - fn->set_num_inputs(1); + fn->add_input_metadata(base_.type(), sizes()); grad_fn_ = std::move(fn); attr_version = current_version; } diff --git a/torch/csrc/cuda/Storage.cpp b/torch/csrc/cuda/Storage.cpp index a9806d9aa5554f..4e3d1b4ef3dd83 100644 --- a/torch/csrc/cuda/Storage.cpp +++ b/torch/csrc/cuda/Storage.cpp @@ -19,7 +19,7 @@ #include template<> -void THPPointer::free() { +void THPPointer::free() { if (ptr) THCStorage_free(LIBRARY_STATE ptr); } diff --git a/torch/csrc/cuda/override_macros.h b/torch/csrc/cuda/override_macros.h index a124903bf51cb5..439f33db001f67 100644 --- a/torch/csrc/cuda/override_macros.h +++ b/torch/csrc/cuda/override_macros.h @@ -1,14 +1,14 @@ #include "undef_macros.h" -#define THStoragePtr THCStoragePtr +#define THWStoragePtr THCStoragePtr #define THPStoragePtr THCPStoragePtr -#define THTensorPtr THCTensorPtr +#define THWTensorPtr THCTensorPtr #define THPTensorPtr THCPTensorPtr -#define THStorage THCStorage -#define THStorage_(NAME) THCStorage_(NAME) -#define THTensor THCTensor -#define THTensor_(NAME) THCTensor_(NAME) +#define THWStorage THCStorage +#define THWStorage_(NAME) THCStorage_(NAME) +#define THWTensor THCTensor +#define THWTensor_(NAME) THCTensor_(NAME) #define THPStorage_(NAME) TH_CONCAT_4(THCP,Real,Storage_,NAME) #define THPStorage THCPStorage @@ -29,10 +29,7 @@ #define THPTensorStateless THCPTensorStateless -#define THSTensorPtr THCSTensorPtr #define THSPTensorPtr THCSPTensorPtr -#define THSTensor THCSTensor -#define THSTensor_(NAME) THCSTensor_(NAME) #define THSPTensor_(NAME) TH_CONCAT_4(THCSP,Real,Tensor_,NAME) #define THSPTensor_stateless_(NAME) TH_CONCAT_4(THCSP,Real,Tensor_stateless_,NAME) diff --git a/torch/csrc/cuda/restore_macros.h b/torch/csrc/cuda/restore_macros.h index ec6eee25232446..054eec69897bbb 100644 --- a/torch/csrc/cuda/restore_macros.h +++ b/torch/csrc/cuda/restore_macros.h @@ -1,6 +1,6 @@ -#define THTensor TH_CONCAT_3(TH,Real,Tensor) -#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) +#define THWTensor TH_CONCAT_3(TH,Real,Tensor) +#define THWTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) #define THPTensor TH_CONCAT_3(THP,Real,Tensor) #define THPTensorStr TH_CONCAT_STRING_3(torch.,Real,Tensor) @@ -13,8 +13,8 @@ #define THPStorage_(NAME) TH_CONCAT_4(THP,Real,Storage_,NAME) #ifdef _THP_CORE -#define THStoragePtr TH_CONCAT_3(TH,Real,StoragePtr) -#define THTensorPtr TH_CONCAT_3(TH,Real,TensorPtr) +#define THWStoragePtr TH_CONCAT_3(TH,Real,StoragePtr) +#define THWTensorPtr TH_CONCAT_3(TH,Real,TensorPtr) #define THPStoragePtr TH_CONCAT_3(THP,Real,StoragePtr) #define THPTensorPtr TH_CONCAT_3(THP,Real,TensorPtr) #endif diff --git a/torch/csrc/cuda/undef_macros.h b/torch/csrc/cuda/undef_macros.h index bfc6600959cc5a..a6d5e834b5f76e 100644 --- a/torch/csrc/cuda/undef_macros.h +++ b/torch/csrc/cuda/undef_macros.h @@ -22,14 +22,14 @@ #undef THPStorageClass #undef THPStorageType -#undef THStorage -#undef THStorage_ -#undef THTensor -#undef THTensor_ +#undef THWStorage +#undef THWStorage_ +#undef THWTensor +#undef THWTensor_ -#undef THStoragePtr +#undef THWStoragePtr #undef THPStoragePtr -#undef THTensorPtr +#undef THWTensorPtr #undef THPTensorPtr @@ -44,9 +44,6 @@ #undef THSPTensorStateless #undef 
THSPTensorType -#undef THSTensor -#undef THSTensor_ -#undef THSTensorPtr #undef THSPTensorPtr diff --git a/torch/csrc/cuda/utils.cpp b/torch/csrc/cuda/utils.cpp index 270d972fc05a1b..30950bd0782ef1 100644 --- a/torch/csrc/cuda/utils.cpp +++ b/torch/csrc/cuda/utils.cpp @@ -7,3 +7,30 @@ #define THC_GENERIC_FILE "torch/csrc/generic/utils.cpp" #include + +#ifdef WITH_CUDA +std::vector THPUtils_PySequence_to_THCStreamList(PyObject *obj) { + if (!PySequence_Check(obj)) { + throw std::runtime_error("Expected a sequence in THPUtils_PySequence_to_THCStreamList"); + } + THPObjectPtr seq = THPObjectPtr(PySequence_Fast(obj, NULL)); + if (seq.get() == NULL) { + throw std::runtime_error("expected PySequence, but got " + std::string(THPUtils_typename(obj))); + } + + std::vector streams; + Py_ssize_t length = PySequence_Fast_GET_SIZE(seq.get()); + for (Py_ssize_t i = 0; i < length; i++) { + PyObject *stream = PySequence_Fast_GET_ITEM(seq.get(), i); + + if (PyObject_IsInstance(stream, THCPStreamClass)) { + streams.push_back( ((THCPStream *)stream)->cdata); + } else if (stream == Py_None) { + streams.push_back(NULL); + } else { + std::runtime_error("Unknown data type found in stream list. Need THCStream or None"); + } + } + return streams; +} +#endif diff --git a/torch/csrc/distributed/override_macros.h b/torch/csrc/distributed/override_macros.h index bc94d056174189..e788533adb7c98 100644 --- a/torch/csrc/distributed/override_macros.h +++ b/torch/csrc/distributed/override_macros.h @@ -1,14 +1,14 @@ #include "undef_macros.h" -#define THStoragePtr THDStoragePtr +#define THWStoragePtr THDStoragePtr #define THPStoragePtr THDPStoragePtr -#define THTensorPtr THDTensorPtr +#define THWTensorPtr THDTensorPtr #define THPTensorPtr THDPTensorPtr -#define THStorage THDStorage -#define THStorage_(NAME) THDStorage_(NAME) -#define THTensor THDTensor -#define THTensor_(NAME) THDTensor_(NAME) +#define THWStorage THDStorage +#define THWStorage_(NAME) THDStorage_(NAME) +#define THWTensor THDTensor +#define THWTensor_(NAME) THDTensor_(NAME) #define THPStorage_(NAME) TH_CONCAT_4(THDP,Real,Storage_,NAME) #define THPStorage THDPStorage diff --git a/torch/csrc/distributed/undef_macros.h b/torch/csrc/distributed/undef_macros.h index a3bed51406e6c3..969817503ec0ce 100644 --- a/torch/csrc/distributed/undef_macros.h +++ b/torch/csrc/distributed/undef_macros.h @@ -20,14 +20,14 @@ #undef THPStorageClass #undef THPStorageType -#undef THStorage -#undef THStorage_ -#undef THTensor -#undef THTensor_ +#undef THWStorage +#undef THWStorage_ +#undef THWTensor +#undef THWTensor_ -#undef THStoragePtr +#undef THWStoragePtr #undef THPStoragePtr -#undef THTensorPtr +#undef THWTensorPtr #undef THPTensorPtr #undef THHostTensor diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 2d34aeabcc9afd..c50b8ce507b60c 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -4,7 +4,7 @@ PyObject *THPStorageClass = NULL; -PyObject * THPStorage_(New)(THStorage *ptr) +PyObject * THPStorage_(New)(THWStorage *ptr) { TORCH_ASSERT(ptr); PyTypeObject *type = (PyTypeObject *)THPStorageClass; @@ -12,24 +12,24 @@ PyObject * THPStorage_(New)(THStorage *ptr) if (obj) { ((THPStorage *)obj)->cdata = ptr; } else { - THStorage_(free)(LIBRARY_STATE ptr); + THWStorage_(free)(LIBRARY_STATE ptr); } return obj; } static void THPStorage_(dealloc)(THPStorage* self) { - THStorage_(free)(LIBRARY_STATE self->cdata); + THWStorage_(free)(LIBRARY_STATE self->cdata); Py_TYPE(self)->tp_free((PyObject*)self); } -static THStorage* 
THPStorage_(newWithAllocator)(int64_t size, THAllocator* allocator) +static THWStorage* THPStorage_(newWithAllocator)(int64_t size, THAllocator* allocator) { #if defined(THC_GENERIC_FILE) || defined(THD_GENERIC_FILE) THPUtils_setError(THPStorageStr " does not support custom allocators"); return NULL; #else - return THStorage_(newWithAllocator)(LIBRARY_STATE size, allocator, NULL); + return THWStorage_(newWithAllocator)(LIBRARY_STATE size, allocator, NULL); #endif } @@ -55,7 +55,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec if (num_args == 0) { PyObject *cdata_ptr = PyDict_GetItemString(kwargs, "cdata"); if (num_kwargs == 1 && cdata_ptr && THPUtils_checkLong(cdata_ptr)) { - THStorage *ptr = (THStorage*)PyLong_AsVoidPtr(cdata_ptr); + THWStorage *ptr = (THWStorage*)PyLong_AsVoidPtr(cdata_ptr); self->cdata = ptr; return (PyObject*)self.release(); } @@ -68,7 +68,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec if (allocator) { self->cdata = THPStorage_(newWithAllocator)(0, allocator); } else { - self->cdata = THStorage_(new)(LIBRARY_STATE_NOARGS); + self->cdata = THWStorage_(new)(LIBRARY_STATE_NOARGS); } return (PyObject*)self.release(); } @@ -81,7 +81,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec if (allocator) { self->cdata = THPStorage_(newWithAllocator)(size, allocator); } else { - self->cdata = THStorage_(newWithSize)(LIBRARY_STATE size); + self->cdata = THWStorage_(newWithSize)(LIBRARY_STATE size); } return (PyObject*)self.release(); } @@ -117,11 +117,11 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec "%" PRId64 ", but the viewed storage has only %" PRId64 " element(s) after offset %" PRId64, size, numel - offset, offset); - real *data_ptr = THStorage_(data)(LIBRARY_STATE storage_arg->cdata) + offset; - THStoragePtr storage(THStorage_(newWithData)(LIBRARY_STATE data_ptr, size)); + real *data_ptr = THWStorage_(data)(LIBRARY_STATE storage_arg->cdata) + offset; + THWStoragePtr storage(THWStorage_(newWithData)(LIBRARY_STATE data_ptr, size)); storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW; storage->view = storage_arg->cdata; - THStorage_(retain)(LIBRARY_STATE storage_arg->cdata); + THWStorage_(retain)(LIBRARY_STATE storage_arg->cdata); self->cdata = storage.release(); return (PyObject*)self.release(); #endif @@ -135,7 +135,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec Py_ssize_t length = PySequence_Length(first_arg); THPUtils_assert(length >= 0, "couldn't obtain the length of %s", THPUtils_typename(first_arg)); - self->cdata = THStorage_(newWithSize)(LIBRARY_STATE length); + self->cdata = THWStorage_(newWithSize)(LIBRARY_STATE length); THPObjectPtr item; try { for (Py_ssize_t i = 0; i < length; i++) { @@ -177,7 +177,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec static Py_ssize_t THPStorage_(length)(THPStorage *self) { HANDLE_TH_ERRORS - return THStorage_(size)(LIBRARY_STATE self->cdata); + return THWStorage_(size)(LIBRARY_STATE self->cdata); END_HANDLE_TH_ERRORS_RET(-1) } @@ -188,13 +188,13 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) if (THPUtils_checkLong(index)) { int64_t nindex = THPUtils_unpackLong(index); if (nindex < 0) - nindex += THStorage_(size)(LIBRARY_STATE self->cdata); + nindex += THWStorage_(size)(LIBRARY_STATE self->cdata); if (nindex < 0 || nindex >= self->cdata->size) { PyErr_Format(PyExc_IndexError, "index %" 
PRId64 " out of range for storage of " "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->size); return NULL; } - real value = THStorage_(get)(LIBRARY_STATE self->cdata, nindex); + real value = THWStorage_(get)(LIBRARY_STATE self->cdata, nindex); return THPUtils_(newReal)(value); /* Slice index */ } else if (PySlice_Check(index)) { @@ -203,7 +203,7 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) return NULL; #else Py_ssize_t start, stop, slicelength, step; - int64_t len = THStorage_(size)(LIBRARY_STATE self->cdata); + int64_t len = THWStorage_(size)(LIBRARY_STATE self->cdata); if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength)) return NULL; if (step != 1) { @@ -212,11 +212,11 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) return NULL; } - real *data = THStorage_(data)(LIBRARY_STATE self->cdata); - THStoragePtr new_storage(THStorage_(newWithData)(LIBRARY_STATE data + start, slicelength)); + real *data = THWStorage_(data)(LIBRARY_STATE self->cdata); + THWStoragePtr new_storage(THWStorage_(newWithData)(LIBRARY_STATE data + start, slicelength)); new_storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW; new_storage->view = self->cdata; - THStorage_(retain)(LIBRARY_STATE self->cdata); + THWStorage_(retain)(LIBRARY_STATE self->cdata); PyObject *_ret = THPStorage_(New)(new_storage); new_storage.release(); @@ -242,11 +242,11 @@ static int THPStorage_(set)(THPStorage *self, PyObject *index, PyObject *value) real rvalue = THPUtils_(unpackReal)(value); if (THPUtils_checkLong(index)) { int64_t nindex = THPUtils_unpackLong(index); - THStorage_(set)(LIBRARY_STATE self->cdata, nindex, rvalue); + THWStorage_(set)(LIBRARY_STATE self->cdata, nindex, rvalue); return 0; } else if (PySlice_Check(index)) { Py_ssize_t start, stop, slicelength, step; - int64_t len = THStorage_(size)(LIBRARY_STATE self->cdata); + int64_t len = THWStorage_(size)(LIBRARY_STATE self->cdata); if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength)) return -1; if (step != 1) { @@ -257,7 +257,7 @@ static int THPStorage_(set)(THPStorage *self, PyObject *index, PyObject *value) // TODO: check the bounds only once // TODO: fill? 
for (;start < stop; start++) - THStorage_(set)(LIBRARY_STATE self->cdata, start, rvalue); + THWStorage_(set)(LIBRARY_STATE self->cdata, start, rvalue); return 0; } THPUtils_setError("can't index a " THPStorageStr " with %s", @@ -319,33 +319,33 @@ static struct PyMemberDef THPStorage_(members)[] = { {NULL} }; -extern THPCopyList THStorage_(copy_functions); -THPCopyList THStorage_(copy_functions); +extern THPCopyList THWStorage_(copy_functions); +THPCopyList THWStorage_(copy_functions); void THPStorage_(initCopyMethods)() { #ifndef THD_GENERIC_FILE - auto& h = THStorage_(copy_functions); + auto& h = THWStorage_(copy_functions); // copy from CPU types - THPInsertStorageCopyFunction(&THPByteStorageType, h, &THStorage_(copyByte)); - THPInsertStorageCopyFunction(&THPCharStorageType, h, &THStorage_(copyChar)); - THPInsertStorageCopyFunction(&THPShortStorageType, h, &THStorage_(copyShort)); - THPInsertStorageCopyFunction(&THPIntStorageType, h, &THStorage_(copyInt)); - THPInsertStorageCopyFunction(&THPLongStorageType, h, &THStorage_(copyLong)); - THPInsertStorageCopyFunction(&THPHalfStorageType, h, &THStorage_(copyHalf)); - THPInsertStorageCopyFunction(&THPFloatStorageType, h, &THStorage_(copyFloat)); - THPInsertStorageCopyFunction(&THPDoubleStorageType, h, &THStorage_(copyDouble)); + THPInsertStorageCopyFunction(&THPByteStorageType, h, &THWStorage_(copyByte)); + THPInsertStorageCopyFunction(&THPCharStorageType, h, &THWStorage_(copyChar)); + THPInsertStorageCopyFunction(&THPShortStorageType, h, &THWStorage_(copyShort)); + THPInsertStorageCopyFunction(&THPIntStorageType, h, &THWStorage_(copyInt)); + THPInsertStorageCopyFunction(&THPLongStorageType, h, &THWStorage_(copyLong)); + THPInsertStorageCopyFunction(&THPHalfStorageType, h, &THWStorage_(copyHalf)); + THPInsertStorageCopyFunction(&THPFloatStorageType, h, &THWStorage_(copyFloat)); + THPInsertStorageCopyFunction(&THPDoubleStorageType, h, &THWStorage_(copyDouble)); #ifdef THC_GENERIC_FILE // copy from GPU types - THPInsertStorageCopyFunction(&THCPByteStorageType, h, &THStorage_(copyCudaByte)); - THPInsertStorageCopyFunction(&THCPCharStorageType, h, &THStorage_(copyCudaChar)); - THPInsertStorageCopyFunction(&THCPShortStorageType, h, &THStorage_(copyCudaShort)); - THPInsertStorageCopyFunction(&THCPIntStorageType, h, &THStorage_(copyCudaInt)); - THPInsertStorageCopyFunction(&THCPLongStorageType, h, &THStorage_(copyCudaLong)); - THPInsertStorageCopyFunction(&THCPFloatStorageType, h, &THStorage_(copyCudaFloat)); - THPInsertStorageCopyFunction(&THCPDoubleStorageType, h, &THStorage_(copyCudaDouble)); + THPInsertStorageCopyFunction(&THCPByteStorageType, h, &THWStorage_(copyCudaByte)); + THPInsertStorageCopyFunction(&THCPCharStorageType, h, &THWStorage_(copyCudaChar)); + THPInsertStorageCopyFunction(&THCPShortStorageType, h, &THWStorage_(copyCudaShort)); + THPInsertStorageCopyFunction(&THCPIntStorageType, h, &THWStorage_(copyCudaInt)); + THPInsertStorageCopyFunction(&THCPLongStorageType, h, &THWStorage_(copyCudaLong)); + THPInsertStorageCopyFunction(&THCPFloatStorageType, h, &THWStorage_(copyCudaFloat)); + THPInsertStorageCopyFunction(&THCPDoubleStorageType, h, &THWStorage_(copyCudaDouble)); #ifdef CUDA_HALF_TENSOR - THPInsertStorageCopyFunction(&THCPHalfStorageType, h, &THStorage_(copyCudaHalf)); + THPInsertStorageCopyFunction(&THCPHalfStorageType, h, &THWStorage_(copyCudaHalf)); #endif // add CPU <- GPU copies to base type #define THPCpuStorage TH_CONCAT_3(THP, Real, Storage) diff --git a/torch/csrc/generic/Storage.h b/torch/csrc/generic/Storage.h index 
141a2c33101657..4a9b0acc576978 100644 --- a/torch/csrc/generic/Storage.h +++ b/torch/csrc/generic/Storage.h @@ -4,10 +4,10 @@ struct THPStorage { PyObject_HEAD - THStorage *cdata; + THWStorage *cdata; }; -THP_API PyObject * THPStorage_(New)(THStorage *ptr); +THP_API PyObject * THPStorage_(New)(THWStorage *ptr); extern PyObject *THPStorageClass; #ifdef _THP_CORE diff --git a/torch/csrc/generic/StorageMethods.cpp b/torch/csrc/generic/StorageMethods.cpp index f525d621502997..26863d9e7404d6 100644 --- a/torch/csrc/generic/StorageMethods.cpp +++ b/torch/csrc/generic/StorageMethods.cpp @@ -5,7 +5,7 @@ static PyObject * THPStorage_(size)(THPStorage *self) { HANDLE_TH_ERRORS - return PyLong_FromLong(THStorage_(size)(LIBRARY_STATE self->cdata)); + return PyLong_FromLong(THWStorage_(size)(LIBRARY_STATE self->cdata)); END_HANDLE_TH_ERRORS } @@ -13,7 +13,7 @@ static PyObject * THPStorage_(size)(THPStorage *self) static PyObject * THPStorage_(dataPtr)(THPStorage *self) { HANDLE_TH_ERRORS - return PyLong_FromVoidPtr(THStorage_(data)(LIBRARY_STATE self->cdata)); + return PyLong_FromVoidPtr(THWStorage_(data)(LIBRARY_STATE self->cdata)); END_HANDLE_TH_ERRORS } #endif @@ -21,7 +21,7 @@ static PyObject * THPStorage_(dataPtr)(THPStorage *self) static PyObject * THPStorage_(copy_)(PyObject *self, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS - return THPStorageCopyMethod(THStorage_(copy_functions), self, args, kwargs); + return THPStorageCopyMethod(THWStorage_(copy_functions), self, args, kwargs); END_HANDLE_TH_ERRORS } @@ -31,7 +31,7 @@ static PyObject * THPStorage_(isPinned)(THPStorage *self) HANDLE_TH_ERRORS #if defined(WITH_CUDA) cudaPointerAttributes attr; - cudaError_t err = cudaPointerGetAttributes(&attr, THStorage_(data)(LIBRARY_STATE self->cdata)); + cudaError_t err = cudaPointerGetAttributes(&attr, THWStorage_(data)(LIBRARY_STATE self->cdata)); if (err != cudaSuccess) { cudaGetLastError(); Py_RETURN_FALSE; @@ -47,14 +47,14 @@ static PyObject * THPStorage_(isPinned)(THPStorage *self) static PyObject * THPStorage_(elementSize)(THPStorage *self) { HANDLE_TH_ERRORS - return PyLong_FromLong(THStorage_(elementSize)(LIBRARY_STATE_NOARGS)); + return PyLong_FromLong(THWStorage_(elementSize)(LIBRARY_STATE_NOARGS)); END_HANDLE_TH_ERRORS } static PyObject * THPStorage_(new)(THPStorage *self) { HANDLE_TH_ERRORS - THStoragePtr new_storage(THStorage_(new)(LIBRARY_STATE_NOARGS)); + THWStoragePtr new_storage(THWStorage_(new)(LIBRARY_STATE_NOARGS)); PyObject *_ret = THPStorage_(New)(new_storage); new_storage.release(); return _ret; @@ -67,7 +67,7 @@ static PyObject * THPStorage_(resize_)(THPStorage *self, PyObject *number_arg) THPUtils_assert(THPUtils_checkLong(number_arg), "resize_ expects an int, " "but got %s", THPUtils_typename(number_arg)); int64_t newsize = THPUtils_unpackLong(number_arg); - THStorage_(resize)(LIBRARY_STATE self->cdata, newsize); + THWStorage_(resize)(LIBRARY_STATE self->cdata, newsize); Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS @@ -79,7 +79,7 @@ static PyObject * THPStorage_(fill_)(THPStorage *self, PyObject *number_arg) THPUtils_assert(THPUtils_(checkReal)(number_arg), "fill_ expects %s, " "but got %s", THPUtils_typeTraits::python_type_str, THPUtils_typename(number_arg)); - THStorage_(fill)(LIBRARY_STATE self->cdata, THPUtils_(unpackReal)(number_arg)); + THWStorage_(fill)(LIBRARY_STATE self->cdata, THPUtils_(unpackReal)(number_arg)); Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS @@ -152,23 +152,23 @@ static PyObject * THPStorage_(fromBuffer)(PyObject 
*_unused, PyObject *args, PyO } uint8_t* src = (uint8_t*) buffer.buf; - THStorage* storage = THStorage_(newWithSize)(count); + THWStorage* storage = THWStorage_(newWithSize)(count); #if defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) - memcpy(THStorage_(data)(storage), src + offset, count); + memcpy(THWStorage_(data)(storage), src + offset, count); #elif defined(TH_REAL_IS_SHORT) - THP_decodeInt16Buffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeInt16Buffer(THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_INT) - THP_decodeInt32Buffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeInt32Buffer(THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_LONG) // TODO: remove the cast - THP_decodeInt64Buffer((int64_t*) THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeInt64Buffer((int64_t*) THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_HALF) - THP_decodeHalfBuffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeHalfBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_FLOAT) - THP_decodeFloatBuffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeFloatBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_DOUBLE) - THP_decodeDoubleBuffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeDoubleBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); #else #error "Unknown type" #endif @@ -192,7 +192,7 @@ static PyObject * THPStorage_(fromFile)(PyObject *_unused, PyObject *args, PyObj } if (shared) shared = TH_ALLOCATOR_MAPPED_SHARED; - THStorage *storage = THStorage_(newWithMapping)(LIBRARY_STATE filename, size, shared); + THWStorage *storage = THWStorage_(newWithMapping)(LIBRARY_STATE filename, size, shared); return (PyObject*)THPStorage_(New)(storage); END_HANDLE_TH_ERRORS } @@ -223,7 +223,7 @@ PyObject * THPStorage_(newWithFile)(PyObject *_unused, PyObject *file) int fd = PyObject_AsFileDescriptor(file); THPUtils_assert(fd != -1, "_new_with_file couldn't retrieve a file " "descriptor from given object"); - THStorage *storage = THPStorage_(readFileRaw)(fd, nullptr); + THWStorage *storage = THPStorage_(readFileRaw)(fd, nullptr); if (storage == nullptr) return nullptr; PyObject *result = THPStorage_(New)(storage); @@ -243,7 +243,7 @@ static PyObject *THPStorage_(setFromFile)(THPStorage *self, PyObject *args) // but it is currently unnecessary to support this. 
THPUtils_assert(offset == Py_None, "_set_from_file: offset is NYI for filelike objects"); - THStorage *storage = THPStorage_(readFileRaw)(file, self->cdata); + THWStorage *storage = THPStorage_(readFileRaw)(file, self->cdata); if (storage == nullptr) { return nullptr; } @@ -258,7 +258,7 @@ static PyObject *THPStorage_(setFromFile)(THPStorage *self, PyObject *args) } THPUtils_assert(fd != -1, "_set_from_file couldn't retrieve a file " "descriptor from given object"); - THStorage *storage = THPStorage_(readFileRaw)(fd, self->cdata); + THWStorage *storage = THPStorage_(readFileRaw)(fd, self->cdata); if (storage == nullptr) return nullptr; Py_INCREF(self); @@ -283,9 +283,9 @@ PyObject * THPStorage_(_setCdata)(THPStorage *self, PyObject *new_cdata) THPUtils_assert(THPUtils_checkLong(new_cdata), "given an invalid argument to " "_set_cdata - expected an int or long, but got %s", THPUtils_typename(new_cdata)); - THStorage *ptr = (THStorage*)PyLong_AsVoidPtr(new_cdata); - THStorage_(retain)(LIBRARY_STATE ptr); - THStorage_(free)(LIBRARY_STATE self->cdata); + THWStorage *ptr = (THWStorage*)PyLong_AsVoidPtr(new_cdata); + THWStorage_(retain)(LIBRARY_STATE ptr); + THWStorage_(free)(LIBRARY_STATE self->cdata); self->cdata = ptr; Py_INCREF(self); return (PyObject*)self; @@ -299,11 +299,11 @@ PyObject * THPStorage_(_rootStorage)(THPStorage *self) if (!(self->cdata->flag & TH_STORAGE_VIEW)) { return Py_BuildValue("(ON)", self, PyLong_FromLong(0)); } - THStorage *root = self->cdata; + THWStorage *root = self->cdata; while (root->flag & TH_STORAGE_VIEW) root = root->view; - size_t offset = THStorage_(data)(LIBRARY_STATE self->cdata) - THStorage_(data)(LIBRARY_STATE root); - THStorage_(retain)(LIBRARY_STATE root); + size_t offset = THWStorage_(data)(LIBRARY_STATE self->cdata) - THWStorage_(data)(LIBRARY_STATE root); + THWStorage_(retain)(LIBRARY_STATE root); THPObjectPtr storage(THPStorage_(New)(root)); PyObject *result = Py_BuildValue("(NN)", storage.get(), PyLong_FromLong(offset)); storage.release(); diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index eabc9f57c61620..4c09f22498a9a7 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -9,7 +9,7 @@ static PyObject * THPStorage_(sharedDecref)(THPStorage *self) HANDLE_TH_ERRORS #ifndef THC_GENERIC_FILE libshm_context *ctx = NULL; - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; if (storage->allocator == &THManagedSharedAllocator) { ctx = (libshm_context*)storage->allocatorContext; } else if (storage->allocator == &THStorageWeakRefAllocator) { @@ -18,7 +18,7 @@ static PyObject * THPStorage_(sharedDecref)(THPStorage *self) ctx = (libshm_context*)allocator_obj->allocatorContext; } if (ctx) - THRefcountedMapAllocator_decref(ctx->th_context, THStorage_(data)(storage)); + THRefcountedMapAllocator_decref(ctx->th_context, THWStorage_(data)(storage)); #endif Py_INCREF(self); return (PyObject *)self; @@ -30,7 +30,7 @@ static PyObject * THPStorage_(sharedIncref)(THPStorage *self) HANDLE_TH_ERRORS #ifndef THC_GENERIC_FILE libshm_context *ctx = NULL; - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; if (storage->allocator == &THManagedSharedAllocator) { ctx = (libshm_context*)storage->allocatorContext; } else if (storage->allocator == &THStorageWeakRefAllocator) { @@ -39,19 +39,19 @@ static PyObject * THPStorage_(sharedIncref)(THPStorage *self) ctx = (libshm_context*)allocator_obj->allocatorContext; } if (ctx) - 
THRefcountedMapAllocator_incref(ctx->th_context, THStorage_(data)(storage)); + THRefcountedMapAllocator_incref(ctx->th_context, THWStorage_(data)(storage)); #endif Py_RETURN_NONE; END_HANDLE_TH_ERRORS } -static PyObject * THPStorage_(newTHView)(THStorage *base, ptrdiff_t offset, size_t size) +static PyObject * THPStorage_(newTHView)(THWStorage *base, ptrdiff_t offset, size_t size) { void *data = (char*)base->data() + offset; - THStoragePtr view(THStorage_(newWithData)(LIBRARY_STATE (real*)data, size)); + THWStoragePtr view(THWStorage_(newWithData)(LIBRARY_STATE (real*)data, size)); view->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW; view->view = base; - THStorage_(retain)(LIBRARY_STATE base); + THWStorage_(retain)(LIBRARY_STATE base); return THPStorage_(New)(view.release()); } @@ -69,12 +69,12 @@ static std::string THPStorage_(__newHandle)() { return handle; } -static THStorage* THPStorage_(newFilenameStorage)(ptrdiff_t size) +static THWStorage* THPStorage_(newFilenameStorage)(ptrdiff_t size) { int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM | TH_ALLOCATOR_MAPPED_EXCLUSIVE; std::string handle = THPStorage_(__newHandle)(); auto ctx = libshm_context_new(NULL, handle.c_str(), flags); - return THStorage_(newWithAllocator)(size, &THManagedSharedAllocator, (void*)ctx); + return THWStorage_(newWithAllocator)(size, &THManagedSharedAllocator, (void*)ctx); } static PyObject * THPStorage_(pyNewFilenameStorage)(PyObject *_unused, PyObject *args) @@ -91,7 +91,7 @@ static PyObject * THPStorage_(pyNewFilenameStorage)(PyObject *_unused, PyObject static PyObject * THPStorage_(shareFilename)(THPStorage *self) { HANDLE_TH_ERRORS - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; libshm_context *ctx; // Storage is already in shared memory, just return a handle if (storage->allocator == &THManagedSharedAllocator) { @@ -102,9 +102,9 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) } else { // TODO: retry on collision AutoNoGIL no_gil; - THStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->size)); - THStorage_(copy)(new_storage, storage); - THStorage_(swap)(storage, new_storage); + THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->size)); + THWStorage_(copy)(new_storage, storage); + THWStorage_(swap)(storage, new_storage); ctx = (libshm_context*)storage->allocatorContext; } @@ -143,12 +143,12 @@ static PyObject * THPStorage_(newSharedFilename)(PyObject *_unused, PyObject *ar int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM | TH_ALLOCATOR_MAPPED_NOCREATE; libshm_context *ctx = libshm_context_new(manager_handle, object_handle, flags); - return THPStorage_(New)(THStorage_(newWithAllocator)(size, + return THPStorage_(New)(THWStorage_(newWithAllocator)(size, &THManagedSharedAllocator, (void*)ctx)); END_HANDLE_TH_ERRORS } -static THStorage* THPStorage_(newFdStorage)(ptrdiff_t size) +static THWStorage* THPStorage_(newFdStorage)(ptrdiff_t size) { int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM | TH_ALLOCATOR_MAPPED_EXCLUSIVE | @@ -156,7 +156,7 @@ static THStorage* THPStorage_(newFdStorage)(ptrdiff_t size) TH_ALLOCATOR_MAPPED_UNLINK; std::string handle = THPStorage_(__newHandle)(); auto ctx = THMapAllocatorContext_new(handle.c_str(), flags); - return THStorage_(newWithAllocator)(size, &THMapAllocator, (void*)ctx); + return THWStorage_(newWithAllocator)(size, &THMapAllocator, (void*)ctx); } static PyObject * THPStorage_(pyNewFdStorage)(PyObject *_unused, PyObject *args) @@ -173,7 +173,7 @@ static PyObject * THPStorage_(pyNewFdStorage)(PyObject *_unused, PyObject *args) 
static PyObject * THPStorage_(shareFd)(THPStorage *self) { HANDLE_TH_ERRORS - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; THMapAllocatorContext *ctx; // Storage is already in shared memory, just return a handle if (storage->allocator == &THMapAllocator) { @@ -183,9 +183,9 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) ctx = (THMapAllocatorContext*)allocator_obj->allocatorContext; } else { AutoNoGIL no_gil; - THStoragePtr new_storage(THPStorage_(newFdStorage)(storage->size)); - THStorage_(copy)(new_storage, storage); - THStorage_(swap)(storage, new_storage); + THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->size)); + THWStorage_(copy)(new_storage, storage); + THWStorage_(swap)(storage, new_storage); ctx = (THMapAllocatorContext*)storage->allocatorContext; } @@ -226,7 +226,7 @@ static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args) TH_ALLOCATOR_MAPPED_KEEPFD | TH_ALLOCATOR_MAPPED_FROMFD; THMapAllocatorContext *ctx = THMapAllocatorContext_newWithFd(NULL, fd, flags); - return THPStorage_(New)(THStorage_(newWithAllocator)(size, &THMapAllocator, + return THPStorage_(New)(THWStorage_(newWithAllocator)(size, &THMapAllocator, (void*)ctx)); END_HANDLE_TH_ERRORS } @@ -236,7 +236,7 @@ static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args) static PyObject * THPStorage_(shareCuda)(THPStorage *self) { HANDLE_TH_ERRORS - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; AutoGPU gpu_guard(storage->device); THPObjectPtr tuple(PyTuple_New(5)); THPObjectPtr device(PyLong_FromLong(storage->device)); @@ -245,9 +245,9 @@ static PyObject * THPStorage_(shareCuda)(THPStorage *self) THPObjectPtr size(PyLong_FromLong(storage->size)); THPObjectPtr _offset(PyLong_FromLong(0)); THPObjectPtr view_size(PyLong_FromLong(storage->size)); - if (THStorage_(data)(LIBRARY_STATE storage)) { + if (THWStorage_(data)(LIBRARY_STATE storage)) { size_t base_size; - void *base_ptr = THCCachingAllocator_getBaseAllocation(THStorage_(data)(LIBRARY_STATE storage), &base_size); + void *base_ptr = THCCachingAllocator_getBaseAllocation(THWStorage_(data)(LIBRARY_STATE storage), &base_size); ptrdiff_t offset = (char*)storage->data() - (char*)base_ptr; cudaIpcMemHandle_t handle; @@ -304,7 +304,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) void *devPtr = NULL; THCudaCheck(cudaIpcOpenMemHandle(&devPtr, handle, cudaIpcMemLazyEnablePeerAccess)); - THStoragePtr base(THStorage_(newWithDataAndAllocator)( + THWStoragePtr base(THWStorage_(newWithDataAndAllocator)( LIBRARY_STATE (real*)devPtr, storage_size, &THCIpcAllocator, (void*)device)); base->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_FREEMEM; @@ -317,13 +317,13 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) } #endif -// Returns an object that holds a "weak" pointer to the THStorage. The -// pointer is set to None when the THStorage is deallocated. This is done by +// Returns an object that holds a "weak" pointer to the THWStorage. The +// pointer is set to None when the THWStorage is deallocated. This is done by // wrapping the storages allocator with THStorageWeakRefAllocator which is // responsible for clearing the weak reference. 
static PyObject * THPStorage_(weakRef)(THPStorage *self, PyObject *weak_ref_class) { HANDLE_TH_ERRORS - THStorage* storage = self->cdata; + THWStorage* storage = self->cdata; while (storage->flag & TH_STORAGE_VIEW) { storage = storage->view; } @@ -367,8 +367,8 @@ PyObject * THPStorage_(newWithWeakPtr)(PyObject *_unused, PyObject *arg) } THPUtils_assert(THPUtils_checkLong(ref.get()), "_new_with_weak_ptr(): arg.cdata must be an 'int'"); - THStorage *storage = (THStorage*)PyLong_AsVoidPtr(ref.get()); - if (THStorage_(retainIfLive)(LIBRARY_STATE storage)) { + THWStorage *storage = (THWStorage*)PyLong_AsVoidPtr(ref.get()); + if (THWStorage_(retainIfLive)(LIBRARY_STATE storage)) { return THPStorage_(New)(storage); } Py_RETURN_NONE; @@ -380,7 +380,7 @@ PyObject * THPStorage_(sharedFd)(THPStorage *self) HANDLE_TH_ERRORS THMapAllocatorContext *ctx = NULL; #ifndef THC_GENERIC_FILE - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; if (storage->allocator == &THMapAllocator) { ctx = (THMapAllocatorContext*)storage->allocatorContext; } else if (storage->allocator == &THStorageWeakRefAllocator) { diff --git a/torch/csrc/generic/serialization.cpp b/torch/csrc/generic/serialization.cpp index 8550e9c9d83a9a..42dff61c2feb9f 100644 --- a/torch/csrc/generic/serialization.cpp +++ b/torch/csrc/generic/serialization.cpp @@ -5,16 +5,16 @@ #define SYSCHECK(call) { ssize_t __result = call; if (__result < 0) throw std::system_error((int) __result, std::system_category()); } template -void THPStorage_(writeFileRaw)(THStorage *self, io fd) +void THPStorage_(writeFileRaw)(THWStorage *self, io fd) { real *data; - int64_t size = THStorage_(size)(LIBRARY_STATE self); + int64_t size = THWStorage_(size)(LIBRARY_STATE self); #ifndef THC_GENERIC_FILE - data = THStorage_(data)(LIBRARY_STATE self); + data = THWStorage_(data)(LIBRARY_STATE self); #else std::unique_ptr cpu_data(new char[size * sizeof(real)]); data = (real*)cpu_data.get(); - THCudaCheck(cudaMemcpy(data, THStorage_(data)(LIBRARY_STATE self), size * sizeof(real), cudaMemcpyDeviceToHost)); + THCudaCheck(cudaMemcpy(data, THWStorage_(data)(LIBRARY_STATE self), size * sizeof(real), cudaMemcpyDeviceToHost)); #endif ssize_t result = doWrite(fd, &size, sizeof(int64_t)); if (result != sizeof(int64_t)) @@ -59,11 +59,11 @@ void THPStorage_(writeFileRaw)(THStorage *self, io fd) } } -template void THPStorage_(writeFileRaw)(THStorage *self, int fd); -template void THPStorage_(writeFileRaw)(THStorage *self, PyObject* fd); +template void THPStorage_(writeFileRaw)(THWStorage *self, int fd); +template void THPStorage_(writeFileRaw)(THWStorage *self, PyObject* fd); template -THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage) +THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) { real *data; int64_t size; @@ -72,18 +72,18 @@ THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage) throw std::runtime_error("unexpected EOF. 
The file might be corrupted."); if (result != sizeof(int64_t)) throw std::system_error(result, std::system_category()); - THStoragePtr storage; + THWStoragePtr storage; if (_storage == nullptr) { - storage = THStorage_(newWithSize)(LIBRARY_STATE size); + storage = THWStorage_(newWithSize)(LIBRARY_STATE size); } else { - THPUtils_assert(THStorage_(size)(LIBRARY_STATE _storage) == size, + THPUtils_assert(THWStorage_(size)(LIBRARY_STATE _storage) == size, "storage has wrong size: expected %ld got %ld", - size, THStorage_(size)(LIBRARY_STATE _storage)); + size, THWStorage_(size)(LIBRARY_STATE _storage)); storage = _storage; } #ifndef THC_GENERIC_FILE - data = THStorage_(data)(LIBRARY_STATE storage); + data = THWStorage_(data)(LIBRARY_STATE storage); #else std::unique_ptr cpu_data(new char[size * sizeof(real)]); data = (real*)cpu_data.get(); @@ -92,7 +92,7 @@ THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage) // fast track for bytes and little endian if (sizeof(real) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { char *bytes = (char *) data; - int64_t remaining = sizeof(real) * THStorage_(size)(LIBRARY_STATE storage); + int64_t remaining = sizeof(real) * THWStorage_(size)(LIBRARY_STATE storage); while (remaining > 0) { // we write and read in 1GB blocks to avoid bugs on some OSes ssize_t result = doRead(file, bytes, THMin(remaining, 1073741824)); @@ -134,13 +134,13 @@ THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage) } #ifdef THC_GENERIC_FILE - THCudaCheck(cudaMemcpy(THStorage_(data)(LIBRARY_STATE storage), data, size * sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THWStorage_(data)(LIBRARY_STATE storage), data, size * sizeof(real), cudaMemcpyHostToDevice)); #endif return storage.release(); } -template THStorage* THPStorage_(readFileRaw)(int fd, THStorage* storage); -template THStorage* THPStorage_(readFileRaw)(PyObject* fd, THStorage* storage); +template THWStorage* THPStorage_(readFileRaw)(int fd, THWStorage* storage); +template THWStorage* THPStorage_(readFileRaw)(PyObject* fd, THWStorage* storage); #undef SYSCHECK diff --git a/torch/csrc/generic/serialization.h b/torch/csrc/generic/serialization.h index d4f46c889f40d7..e73ac7c76229a9 100644 --- a/torch/csrc/generic/serialization.h +++ b/torch/csrc/generic/serialization.h @@ -3,9 +3,9 @@ #else template -void THPStorage_(writeFileRaw)(THStorage *self, io fd); +void THPStorage_(writeFileRaw)(THWStorage *self, io fd); template -THStorage * THPStorage_(readFileRaw)(io fd, THStorage *storage); +THWStorage * THPStorage_(readFileRaw)(io fd, THWStorage *storage); #endif diff --git a/torch/csrc/generic/utils.cpp b/torch/csrc/generic/utils.cpp index fe722b1ab28943..0e59a81ed1ee6a 100644 --- a/torch/csrc/generic/utils.cpp +++ b/torch/csrc/generic/utils.cpp @@ -9,9 +9,9 @@ #endif template<> -void THPPointer::free() { +void THPPointer::free() { if (ptr) - THTensor_(free)(LIBRARY_STATE ptr); + THWTensor_(free)(LIBRARY_STATE ptr); } template<> @@ -20,20 +20,8 @@ void THPPointer::free() { Py_DECREF(ptr); } -#if GENERATE_SPARSE -template<> -void THPPointer::free() { - if (ptr) - THSTensor_(free)(LIBRARY_STATE ptr); -} -#endif - - -template class THPPointer; +template class THPPointer; template class THPPointer; -#if GENERATE_SPARSE -template class THPPointer; -#endif #undef GENERATE_SPARSE diff --git a/torch/csrc/generic/utils.h b/torch/csrc/generic/utils.h index a9191a7c6f1b30..bcccffa051b770 100644 --- a/torch/csrc/generic/utils.h +++ b/torch/csrc/generic/utils.h @@ -11,14 +11,10 @@ 
struct THPStorage; struct THSPTensor; -typedef class THPPointer THStoragePtr; -typedef class THPPointer THTensorPtr; +typedef class THPPointer THWStoragePtr; +typedef class THPPointer THWTensorPtr; typedef class THPPointer THPStoragePtr; -#if GENERATE_SPARSE -typedef class THPPointer THSTensorPtr; -#endif - #if (!defined(THC_GENERIC_FILE) || defined(THC_REAL_IS_HALF)) && \ (!defined(THD_GENERIC_FILE)) template<> diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 4e9f50f98c451d..8cf5bf386f25aa 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -15,6 +15,7 @@ #include "torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/remove_expands.h" #include "torch/csrc/jit/passes/decompose_addmm.h" +#include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/function.h" @@ -349,6 +350,7 @@ struct GraphExecutorImpl { // do not work on variables // They also may assume that concrete sizes/strides are availiable + UnrollLoops(graph); //TODO: create peephole optimizations that are safe to run // when we are using variables, and when we do not know sizes. diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 2790161e2208da..b7514465db0c28 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -16,6 +16,7 @@ #include "torch/csrc/jit/passes/onnx/fixup_onnx_loop.h" #include "torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/decompose_addmm.h" +#include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/script/init.h" #include "torch/csrc/jit/script/python_tree_views.h" @@ -76,6 +77,7 @@ void initJITBindings(PyObject *module) { auto tensor_inputs = createVariableTensorList(inputs); PropagateInputShapes(graph, ArgumentSpec(with_grad, tensor_inputs)); }) + .def("_jit_pass_loop_unrolling", UnrollLoops) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 40f95c6f3b1156..bbeb60fddef6c9 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -557,10 +557,18 @@ inline Value* Value::setUniqueName(const std::string & name) { auto old_owner_of_name = names.find(name); if(old_owner_of_name != names.end()) { size_t suffix = 1; + std::string name_base = name; + auto last_dot_pos = name.find_last_of('.'); + if (last_dot_pos != std::string::npos && last_dot_pos + 1 != name.size()) { + if (name.find_first_not_of("0123456789", last_dot_pos + 1) == std::string::npos) { + suffix = std::stoll(name.substr(last_dot_pos + 1)); + name_base = name.substr(0, last_dot_pos); + } + } std::string replacement_name; do { std::stringstream ss; - ss << name << "." << suffix++; + ss << name_base << "." 
<< suffix++; replacement_name = ss.str(); } while(names.count(replacement_name) > 0); old_owner_of_name->second->setUniqueName(replacement_name); diff --git a/torch/csrc/jit/passes/dead_code_elimination.cpp b/torch/csrc/jit/passes/dead_code_elimination.cpp index 3e5b4d95703466..42b1e8086b9cca 100644 --- a/torch/csrc/jit/passes/dead_code_elimination.cpp +++ b/torch/csrc/jit/passes/dead_code_elimination.cpp @@ -1,23 +1,49 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" +#include + namespace torch { namespace jit { -void EliminateDeadCode(std::shared_ptr& graph) { - EliminateDeadCode(graph->block()); -} -bool hasSideEffects(Node * node) { - return node->kind() == prim::Print || node->blocks().size() > 0; +using bool_memo_type = std::unordered_map; + +bool hasSideEffects(Node * node, bool_memo_type& memo) { + // FIXME: PythonOp and CppOp should be treated as having side effects as well! + // Unfortunately ONNX depends on them getting removed in this pass, so it's not + // a simple change. + auto it = memo.find(node); + if (it != memo.end()) + return it->second; + bool has_side_effects = node->kind() == prim::Print || + std::any_of(node->blocks().begin(), node->blocks().end(), + [&](Block *b) { + return std::any_of(b->nodes().begin(), b->nodes().end(), + [&](Node *n) { return hasSideEffects(n, memo); }); + }); + memo.emplace(node, has_side_effects); + return has_side_effects; } -void EliminateDeadCode(Block *block) { +void EliminateDeadCode(Block *block, bool recurse, bool_memo_type& memo) { auto nodes = block->nodes().reverse(); for (auto it = nodes.begin(); it != nodes.end(); it++) { auto node = *it; - for (Block * block : node->blocks()) - EliminateDeadCode(block); - if (!node->hasUses() && !hasSideEffects(node)) + if (recurse) { + for (Block * block : node->blocks()) + EliminateDeadCode(block, true, memo); + } + if (!node->hasUses() && !hasSideEffects(node, memo)) it.destroyCurrent(); } } -}} +void EliminateDeadCode(std::shared_ptr& graph) { + bool_memo_type side_effect_memo; + EliminateDeadCode(graph->block(), true, side_effect_memo); +} + +void EliminateDeadCode(Block *block, bool recurse) { + bool_memo_type side_effect_memo; + EliminateDeadCode(block, recurse, side_effect_memo); +} + +}} // namespace torch::jit diff --git a/torch/csrc/jit/passes/dead_code_elimination.h b/torch/csrc/jit/passes/dead_code_elimination.h index 438e33d7ebcfef..a6b38f16f586f4 100644 --- a/torch/csrc/jit/passes/dead_code_elimination.h +++ b/torch/csrc/jit/passes/dead_code_elimination.h @@ -5,6 +5,6 @@ namespace torch { namespace jit { void EliminateDeadCode(std::shared_ptr& graph); -void EliminateDeadCode(Block *block); +void EliminateDeadCode(Block *block, bool recurse=true); }} diff --git a/torch/csrc/jit/passes/loop_unrolling.cpp b/torch/csrc/jit/passes/loop_unrolling.cpp new file mode 100644 index 00000000000000..bbf05f198a3b61 --- /dev/null +++ b/torch/csrc/jit/passes/loop_unrolling.cpp @@ -0,0 +1,212 @@ +#include "torch/csrc/jit/passes/loop_unrolling.h" + +#include "torch/csrc/jit/interned_strings.h" +#include "torch/csrc/jit/symbolic_variable.h" +#include "torch/csrc/jit/tensor_conversions.h" +#include "torch/csrc/jit/passes/dead_code_elimination.h" + +namespace torch { namespace jit { + +namespace { + +static constexpr int64_t kUnrollFactor = 8; +static constexpr int64_t kMaxBodySize = 16; +static constexpr int64_t kMaxBodyRepeats = 64; + +bool isTrueConstant(Value *val) { + at::optional maybe_value = constant_as(val); + return maybe_value && *maybe_value; +} + +bool isForLoop(Node* node) { + 
if (node->kind() != prim::Loop) + return false; + Value *start_cond = node->inputs().at(1); + Value *continue_cond = node->blocks().at(0)->outputs().at(0); + return isTrueConstant(start_cond) && isTrueConstant(continue_cond); +} + +// Counts the size of this block, stopping and returning once reaches limit instructions. +int64_t limitedBlockSize(Block *body, int64_t limit) { + auto it = body->nodes().begin(); + auto end = body->nodes().end(); + for (int64_t i = 0; i < limit; ++i, ++it) { + for (Block *subblock : it->blocks()) { + i += limitedBlockSize(subblock, limit - i); + } + if (it == end) { + return i; + } + } + return limit; +} + +bool isSmallBlock(Block *body) { + return limitedBlockSize(body, kMaxBodySize + 1) <= kMaxBodySize; +} + +// XXX: This function can only be called with a loop that is guaranteed to execute EXACTLY ONCE. +void inlineBody(Node *loop) { + auto graph = loop->owningGraph(); + auto body = loop->blocks().at(0); + WithInsertPoint insert_point_guard { loop }; + + std::unordered_map value_map; + auto get_value = [&](Value *v) { + auto it = value_map.find(v); + if (it != value_map.end()) + return it->second; + return v; + }; + + // Loop node has extra (max_iters, initial_cond) inputs, + // body has an extra (loop_counter) input. + for (size_t i = 2; i < loop->inputs().size(); ++i) { + value_map[body->inputs()[i - 1]] = loop->inputs()[i]; + } + + for (Node *orig : body->nodes()) { + Node *clone = graph->insertNode(graph->createClone(orig, get_value)); + for (size_t i = 0; i < orig->outputs().size(); ++i) { + value_map[orig->outputs()[i]] = clone->outputs()[i]; + } + } + for (size_t i = 0; i < loop->outputs().size(); ++i) { + loop->outputs().at(i)->replaceAllUsesWith(get_value(body->outputs().at(i + 1))); + } + // XXX: it is extremely important to destroy the loop in here. DCE might not be able + // to conclude that it's safe, because the loop might contain side effects. + loop->destroy(); +} + +void repeatBody(Block *body, int64_t times) { + // We will be adding nodes to the body, so cache the initial start and end. + // XXX: they are both inclusive, because the exclusive body_end would point to + // return_node, which would move further away if we were to add nodes, and we + // would enter an infinite loop. 
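For intuition (an editorial aside, not part of the patch): with a single loop-carried value, repeatBody(body, 3) leaves behind a straight-line body of roughly this shape,

  %y1 = f(%y0)   // original copy
  %y2 = f(%y1)   // clone; loop-carried input remapped to the previous copy's output
  %y3 = f(%y2)   // clone

with the block outputs rewired to the last copy's values, which is what the value_map threading below implements.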
+ auto body_start = body->nodes().begin(); + auto body_end = std::prev(body->nodes().end()); + auto graph = body->owningGraph(); + WithInsertPoint insert_point_guard { body }; + + std::unordered_map value_map; + auto get_value = [&](Value *v) { + auto it = value_map.find(v); + if (it != value_map.end()) + return it->second; + return v; + }; + + for (int64_t i = 1; i < times; ++i) { + // Update loop-carried values + // NB: note that we don't need to worry about the loop counter, because we've + // replaced it with a loop-carried variable + JIT_ASSERT(body->inputs().size() == body->outputs().size()); + for (size_t i = 1; i < body->inputs().size(); ++i) { + value_map[body->inputs()[i]] = get_value(body->outputs()[i]); + } + + // Clone the nodes + for (auto it = body_start; it != std::next(body_end); ++it) { + Node *orig = *it; + Node *clone = graph->insertNode(graph->createClone(orig, get_value)); + for (size_t i = 0; i < orig->outputs().size(); ++i) { + value_map[orig->outputs()[i]] = clone->outputs()[i]; + } + } + } + + // Update outputs of the body + const std::vector new_outputs = fmap(body->outputs(), get_value); + for (int64_t i = new_outputs.size() - 1; i >= 0; --i) { + body->eraseOutput(i); + } + for (Value *output : new_outputs) { + body->registerOutput(output); + } + + // It's likely that we have some dead nodes now - for example the "true" constant + // that prevents the loop from breaking. We shouldn't wait too long before removing + // them because they might artificially increase the loop size and prevent outer loop + // unrolling. + EliminateDeadCode(body, false); +} + +// Replaces the builtin loop counter with a "mutable" variable outside of the loop. +void replaceLoopCounter(Node *loop) { + Graph *graph = loop->owningGraph(); + Block *body = loop->blocks().at(0); + Node *init_counter_node = graph->createConstant(at::CPU(at::kLong).scalarTensor(0)) + ->insertBefore(loop); + loop->insertInput(2, init_counter_node->output()); + loop->insertOutput(0); + + Value * internal_counter = body->insertInput(1); + body->inputs()[0]->replaceAllUsesWith(internal_counter); + + WithInsertPoint insertPointGuard{ body->return_node() }; + body->insertOutput(1, SymbolicVariable(internal_counter) + at::Scalar(1)); +} + +void unroll(Node *loop) { + Graph *graph = loop->owningGraph(); + Block *body = loop->blocks().at(0); + if (!isSmallBlock(body)) + return; + + // We will be using a "mutable" counter outside of the loop instead of the default + // one, because this will allow us to share it between the unrolled loop and its epilogue. + // This is necessary only if the loop counter is actually used in the body. + if (body->inputs()[0]->uses().size() > 0) + replaceLoopCounter(loop); + + // Some optimization for constant-length loops. If we know they won't run too many + // times, then we can unroll them entirely. + Value *trip_count = loop->inputs().at(0); + int64_t const_len = constant_as(trip_count).value_or(-1); + if (const_len != -1 && const_len < kMaxBodyRepeats) { + repeatBody(body, const_len); + inlineBody(loop); + return; + } + + WithInsertPoint insert_point_guard { loop }; + + // Clone the loop before we unroll it. The clone will become the epilogue. 
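Concretely (an editorial worked example, not part of the patch): with kUnrollFactor = 8 and a runtime trip count of 19, the rewritten main loop runs 19 / 8 = 2 times, each pass covering 8 copies of the original body (16 iterations), and the epilogue cloned just below runs 19 - 2 * 8 = 3 times to finish the remainder. Loops whose trip count is a known constant below kMaxBodyRepeats = 64 never reach this point; they are fully expanded and inlined by the branch above.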
+ Node *loop_epilogue = graph->createClone(loop, [](Value *v) { return v; }) + ->insertAfter(loop); + for (size_t i = 0; i < loop->outputs().size(); ++i) { + loop->outputs()[i]->replaceAllUsesWith(loop_epilogue->outputs()[i]); + loop_epilogue->replaceInput(i + 2, loop->outputs()[i]); + } + + repeatBody(body, kUnrollFactor); + + // Change the iteration counts of both loops + SymbolicVariable iter_count = loop->inputs().at(0); + SymbolicVariable unrolled_iter_count = iter_count / kUnrollFactor; + loop->replaceInput(0, unrolled_iter_count); + loop_epilogue->replaceInput(0, iter_count - (unrolled_iter_count * kUnrollFactor)); +} + +void UnrollLoops(Block *block) { + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + // XXX: unroll might destroy the current node, so we need to pre-increment the iterator + Node *node = *it; ++it; + for (Block *subblock : node->blocks()) { + UnrollLoops(subblock); + } + if (isForLoop(node)) { + unroll(node); + } + } +} + +} // anonymous namespace + +void UnrollLoops(std::shared_ptr& graph) { + UnrollLoops(graph->block()); + EliminateDeadCode(graph); +} + +}} // namespace torch::jit diff --git a/torch/csrc/jit/passes/loop_unrolling.h b/torch/csrc/jit/passes/loop_unrolling.h new file mode 100644 index 00000000000000..4ca1fd761f9459 --- /dev/null +++ b/torch/csrc/jit/passes/loop_unrolling.h @@ -0,0 +1,9 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +void UnrollLoops(std::shared_ptr& graph); + +}} // namespace torch::jit diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 8f9f818e623916..163b8adda1ace3 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -290,20 +290,6 @@ static bool isTensorSubtype(Value* v) { return v->type()->isSubtypeOf(*DynamicType::get()); } -// if a value is a constant then try to turn into type T using the -// same rules as the interpreter -template -at::optional constant_as(Value* v) { - if(v->node()->kind() != prim::Constant) - return at::nullopt; - auto tensor = v->node()->t(attr::value); - try { - return tensor_as(std::move(tensor)); - } catch (tensor_conversion_error& err) { - return at::nullopt; - } -} - at::optional> getIntListAttribute(at::optional N, Value* input) { auto list = constant_as(input); if(list) @@ -324,9 +310,21 @@ void liftConstantAttributes(const FunctionSchema& schema, Node* node) { JIT_ASSERT(!node->hasAttributes()); std::vector new_inputs; Attributes attributes; - for(size_t i = 0; i < node->inputs().size(); ++i) { + for(size_t i = 0, n = 0; i < schema.arguments.size(); ++i) { const auto& arg = schema.arguments[i]; - auto input = node->input(i); + // this was a builtin with a vararg list lowered, + if(arg.type->kind() == TypeKind::ListType) { + // we do not support constant lifting of the arg itself + if(arg.attribute_info) + return; + // but we do support it for other values so we need to skip all the vararg nodes: + size_t vararg_list_size = node->inputs().size() - (schema.arguments.size() - 1); + while(n < i + vararg_list_size) { + new_inputs.push_back(node->input(n++)); + } + continue; + } + auto input = node->input(n++); if(arg.attribute_info) { switch(arg.attribute_info->kind) { case AttributeKind::i: { diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index e488a7535ea25b..d9d44fe5658685 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -4,6 +4,14 @@ #include "torch/csrc/jit/tensor_conversions.h" #include 
"torch/csrc/jit/python_tracer.h" +#include + +#include +#include +#include +#include +#include + namespace torch { namespace jit { namespace script { @@ -248,11 +256,11 @@ struct ModuleValue : public SugaredValue { // select an attribute on it, e.g. `this.field` virtual std::shared_ptr attr(SourceRange loc, Method & m, const std::string& field) override { - if(at::optional v = module->find_module(field)) { + if(NamedModule* v = module->find_module(field)) { return std::make_shared(v->module); - } else if(at::optional v = module->find_method(field)) { + } else if(Method* v = module->find_method(field)) { return std::make_shared(module, *v); - } else if(at::optional v = module->find_parameter(field)) { + } else if(NamedParameter* v = module->find_parameter(field)) { return std::make_shared(m.get_or_add_parameter(v->slot())); } // This can also be a call to a non-script module, or a plain @@ -330,11 +338,11 @@ py::object unpackVariableTensorList(std::vector outputs) { } static void gatherParametersAndBuffers(std::vector & values, const Module & m) { - for(auto & params : m.get_parameters()) { - values.push_back(params.slot()); + for(auto & param : m.get_parameters()) { + values.push_back(param->slot()); } for(const auto & sub : m.get_modules()) { - gatherParametersAndBuffers(values, *sub.module); + gatherParametersAndBuffers(values, *sub->module); } } @@ -378,8 +386,8 @@ void initJitScriptBindings(PyObject* module) { auto & modules = self.get_modules(); py::tuple result(modules.size()); for(size_t i = 0; i < modules.size(); ++i) { - auto & nm = modules[i]; - result[i] = std::make_pair(nm.name, nm.module); + auto & item = modules[i]; + result[i] = std::make_pair(item.key, item.value); } return result; }) @@ -390,9 +398,9 @@ void initJitScriptBindings(PyObject* module) { auto & p = parameters[i]; py::tuple r(3); result[i] = std::make_tuple( - p.name, - static_cast(*p.slot()), - p.is_buffer); + p.key, + static_cast(*p->slot()), + p->is_buffer); } return result; @@ -416,8 +424,9 @@ void initJitScriptBindings(PyObject* module) { return bool(self.find_method(name)); }) .def("_method_names", [](Module& self) { - return fmap(self.get_methods(), [](const std::unique_ptr & m) { - return m->name(); + using Item = torch::detail::OrderedDict>::Item; + return fmap(self.get_methods(), [](const Item & item) { + return (*item)->name(); }); }) .def("_create_method_from_trace", []( diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 64ea9118da5c56..b51a5fdf82d10d 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -7,8 +7,17 @@ #include "torch/csrc/jit/function_schema.h" #include "torch/csrc/jit/named_value.h" +#include + #include +#include + #include +#include +#include +#include +#include +#include // This file contains classes which assist in desugaring Python style // modules and their methods into flattened graphs which don't have any @@ -189,69 +198,12 @@ struct NamedParameter { std::unique_ptr parameter; }; -// simple ordered dict used only in Module -// contains only the minimum necessary functionality for Module -template -struct OrderedDict { - OrderedDict(const char * what) - : what(what) {} - // note: slight difference from python here. 
- // we do not allow for insertion of an already existing value, - // because we not allow allow methods or submodules to be updated - // once created - T& insert(const std::string& name, T&& value) { - if(index_.count(name) != 0) { - std::stringstream ss; - ss << "module " << what << "'" << name << "' already defined."; - throw std::runtime_error(ss.str()); - } - values_.push_back(std::move(value)); - index_[name] = values_.size() - 1; - return values_.back(); - } - at::optional find(const std::string& str) { - auto it = index_.find(str); - if(it == index_.end()) - return at::nullopt; - return at::optional(values_.at(it->second)); - } - at::optional find(const std::string& str) const { - auto it = index_.find(str); - if(it == index_.end()) - return at::nullopt; - return at::optional(values_.at(it->second)); - } - T& get(const std::string& name) { - if(auto v = find(name)) { - return *v; - } - std::stringstream ss; - ss << "module " << what << "'" << name << "' is not defined."; - throw std::runtime_error(ss.str()); - } - const T& get(const std::string& name) const { - if(auto v = find(name)) { - return *v; - } - std::stringstream ss; - ss << "module " << what << "'" << name << "' is not defined."; - throw std::runtime_error(ss.str()); - } - const std::vector& values() const { - return values_; - } -private: - std::unordered_map index_; - std::vector values_; - const char * what; -}; - struct Module : public std::enable_shared_from_this { TH_DISALLOW_COPY_AND_ASSIGN(Module); Module() - : modules("modules") - , parameters("parameters") - , methods("methods") + : modules("Module") + , parameters("Parameter") + , methods("Method") , optimize(true) {} // note this doesn't change the flags of existing methods just ones @@ -296,7 +248,7 @@ struct Module : public std::enable_shared_from_this { } // each module owns its method. The reference returned here - // is guarenteed to stay valid until this module has been destoryed + // is guarenteed to stay valid until this module has been destroyed Method& get_method(const std::string& name) const { return *methods.get(name); } @@ -305,39 +257,38 @@ struct Module : public std::enable_shared_from_this { return modules.get(name).module; } - const std::vector& get_modules() const { - return modules.values(); + const detail::OrderedDict& get_modules() const { + return modules; } - const std::vector& get_parameters() const { - return parameters.values(); + const detail::OrderedDict& get_parameters() const { + return parameters; } - const std::vector>& get_methods() const { - return methods.values(); + const detail::OrderedDict>& get_methods() const { + return methods; } - - at::optional find_parameter(const std::string& name) { + NamedParameter* find_parameter(const std::string& name) { return parameters.find(name); } - at::optional find_module(const std::string& name) { + NamedModule* find_module(const std::string& name) { return modules.find(name); } - at::optional find_method(const std::string& name) { - if(auto pm = methods.find(name)) - return at::optional(**pm); - return at::nullopt; + Method* find_method(const std::string& name) { + if (auto* pm = methods.find(name)) { + return pm->get(); + } + return nullptr; } - -private: + private: // invariant: to ensure member_inputs of Methods stay valid, // it is only legal to _add_ new modules and parameters. 
// removing them will allow member_inputs to point to invalid parameters // no such restriction exists for methods - OrderedDict modules; - OrderedDict parameters; - OrderedDict> methods; + detail::OrderedDict modules; + detail::OrderedDict parameters; + detail::OrderedDict> methods; bool optimize; }; diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index 1e971f96ade355..238022333ff7c8 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -79,6 +79,24 @@ struct SymbolicVariable { SymbolicVariable operator-() const { return create(aten::neg, {*this})[0].typeLike(*this); } + SymbolicVariable operator-(const SymbolicVariable rhs) const { + Node *n; + auto r = create(aten::sub, {*this, rhs}, 1, &n)[0].typeLike(*this); + n->t_(attr::alpha, at::Scalar(1).toTensor()); + return r; + } + SymbolicVariable operator/(at::Scalar rhs) const { + Node *n; + auto r = create(aten::div, {*this}, 1, &n)[0].typeLike(*this); + n->t_(attr::other, rhs.toTensor()); + return r; + } + SymbolicVariable operator%(at::Scalar rhs) const { + Node *n; + auto r = create(aten::remainder, {*this}, 1, &n)[0].typeLike(*this); + n->t_(attr::other, rhs.toTensor()); + return r; + } SymbolicVariable mm(const SymbolicVariable rhs) const { auto r = create(t("mm"), {*this, rhs})[0]; return r; diff --git a/torch/csrc/jit/tensor_conversions.h b/torch/csrc/jit/tensor_conversions.h index 3401d0c0d9b2e1..ef3c0febdaa711 100644 --- a/torch/csrc/jit/tensor_conversions.h +++ b/torch/csrc/jit/tensor_conversions.h @@ -123,4 +123,22 @@ inline at::Tensor as_variable(const T& t) { return autograd::make_variable(as_tensor(t)); } +////////////////////////////////////////////////////////////////////////////////// +// Helper for retrieving constants +////////////////////////////////////////////////////////////////////////////////// + +// if a value is a constant then try to turn into type T using the +// same rules as the interpreter +template +at::optional constant_as(Value* v) { + if(v->node()->kind() != prim::Constant) + return at::nullopt; + auto tensor = v->node()->t(attr::value); + try { + return tensor_as(std::move(tensor)); + } catch (tensor_conversion_error& err) { + return at::nullopt; + } +} + }} // namespace torch::jit diff --git a/torch/csrc/utils.cpp b/torch/csrc/utils.cpp index af7cb560ddbbfb..f8266acec3e8e8 100644 --- a/torch/csrc/utils.cpp +++ b/torch/csrc/utils.cpp @@ -17,10 +17,6 @@ #include "generic/utils.cpp" #include -#ifdef WITH_CUDA -#include "torch/csrc/cuda/THCP.h" -#endif - int THPUtils_getCallable(PyObject *arg, PyObject **result) { if (!PyCallable_Check(arg)) return 0; @@ -231,30 +227,3 @@ bool maybeThrowBackCompatKeepdimWarn(char *func) { } return true; } - -#ifdef WITH_CUDA -std::vector THPUtils_PySequence_to_THCStreamList(PyObject *obj) { - if (!PySequence_Check(obj)) { - throw std::runtime_error("Expected a sequence in THPUtils_PySequence_to_THCStreamList"); - } - THPObjectPtr seq = THPObjectPtr(PySequence_Fast(obj, NULL)); - if (seq.get() == NULL) { - throw std::runtime_error("expected PySequence, but got " + std::string(THPUtils_typename(obj))); - } - - std::vector streams; - Py_ssize_t length = PySequence_Fast_GET_SIZE(seq.get()); - for (Py_ssize_t i = 0; i < length; i++) { - PyObject *stream = PySequence_Fast_GET_ITEM(seq.get(), i); - - if (PyObject_IsInstance(stream, THCPStreamClass)) { - streams.push_back( ((THCPStream *)stream)->cdata); - } else if (stream == Py_None) { - streams.push_back(NULL); - } else { - std::runtime_error("Unknown 
data type found in stream list. Need THCStream or None"); - } - } - return streams; -} -#endif diff --git a/torch/csrc/utils.h b/torch/csrc/utils.h index 5756954bc27027..296cd5e97229fe 100644 --- a/torch/csrc/utils.h +++ b/torch/csrc/utils.h @@ -133,11 +133,10 @@ void THPUtils_addPyMethodDefs(std::vector& vector, PyMethodDef* met int THPUtils_getCallable(PyObject *arg, PyObject **result); -#define THStoragePtr TH_CONCAT_3(TH,Real,StoragePtr) -#define THTensorPtr TH_CONCAT_3(TH,Real,TensorPtr) +#define THWStoragePtr TH_CONCAT_3(TH,Real,StoragePtr) +#define THWTensorPtr TH_CONCAT_3(TH,Real,TensorPtr) #define THPStoragePtr TH_CONCAT_3(THP,Real,StoragePtr) #define THPTensorPtr TH_CONCAT_3(THP,Real,TensorPtr) -#define THSTensorPtr TH_CONCAT_3(THS,Real,TensorPtr) #define THSPTensorPtr TH_CONCAT_3(THSP,Real,TensorPtr) typedef THPPointer THPGeneratorPtr; diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f4e20f722b18a6..0dd9099d8a677b 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -367,9 +367,9 @@ def _script_graph(fn, _frames_up=0): return _jit_script_compile(ast, rcb) -def script(fn, _frames_up=0): +def script(fn, optimize=True, _frames_up=0): graph = _script_graph(fn, _frames_up=_frames_up + 1) - return functools.wraps(fn)(torch._C.GraphExecutor(graph, True)) + return functools.wraps(fn)(torch._C.GraphExecutor(graph, optimize)) ScriptMethodStub = namedtuple('ScriptMethodStub', ('resolution_callback', 'ast', 'original_method')) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index d661af34038985..2e216216da59cb 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -15,9 +15,9 @@ include(CMakeInitializeConfigs) # Relies on CMAKE_INSTALL_PREFIX to be set to ../tmp_install. # It then finds $PREFIX/share/cmake/ATen/ATenConfig.cmake, # which defines ATEN_INCLUDE_DIR and ATEN_LIBRARIES. 
-find_package(ATen REQUIRED) -if(NOT ATen_FOUND) - message(FATAL_ERROR "ATen not found") +find_package(Caffe2 REQUIRED) +if(NOT Caffe2_FOUND) + message(FATAL_ERROR "Caffe2 not found") endif() find_package(Gloo REQUIRED) @@ -39,12 +39,6 @@ if(NOT CUDA_FOUND) message(FATAL_ERROR "CUDA not found") endif() -set(CUDA_PROPAGATE_HOST_FLAGS OFF) -if (NOT MSVC) - list(APPEND CUDA_NVCC_FLAGS "-std=c++11") - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") -endif() - set(C10D_SRCS Utils.cpp Store.cpp @@ -55,13 +49,17 @@ set(C10D_SRCS ) add_library(c10d ${C10D_SRCS}) -target_compile_options(c10d PUBLIC "-std=c++11") -target_include_directories(c10d PUBLIC ${ATEN_INCLUDE_DIR}) -target_link_libraries(c10d PUBLIC ${ATEN_LIBRARIES}) -target_include_directories(c10d PUBLIC ${CUDA_INCLUDE_DIRS}) -target_link_libraries(c10d PUBLIC ${CUDA_LIBRARIES}) +target_compile_options(c10d PUBLIC + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + ) +target_link_libraries(c10d PUBLIC caffe2_gpu) -# c10d links to ATen, but the ATen targets don't add TH/THC to the include path +# c10d links to Caffe2/ATen, but the targets don't add TH/THC to the include path target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/TH) target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/THC) diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 01d29829661195..435486b6f64071 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -85,6 +85,8 @@ const ::gloo::ReductionFunction* reductionFunction(const ReduceOp& r) { return ::gloo::ReductionFunction::min; case ReduceOp::MAX: return ::gloo::ReductionFunction::max; + case ReduceOp::UNUSED: + break; } throw std::runtime_error("Unhandled ReduceOp"); @@ -92,7 +94,7 @@ const ::gloo::ReductionFunction* reductionFunction(const ReduceOp& r) { std::vector getStreamVector(AlgorithmEntry& entry) { std::vector streams(entry.streams.size()); - for (auto i = 0; i < entry.streams.size(); i++) { + for (size_t i = 0; i < entry.streams.size(); i++) { streams[i] = entry.streams[i].getStream(); } return streams; @@ -103,7 +105,7 @@ std::vector getStreamVector(AlgorithmEntry& entry) { void synchronizeStreams(THCState* thcState, AlgorithmEntry* entry) { CUDADevice deviceGuard; const auto& key = entry->key; - for (auto i = 0; i < key.devices.size(); i++) { + for (size_t i = 0; i < key.devices.size(); i++) { const auto& device = key.devices[i]; auto publicStream = THCState_getCurrentStreamOnDevice(thcState, device); auto privateStream = entry->streams[i].getStream(); @@ -138,7 +140,7 @@ bool ProcessGroupGloo::WorkGloo::isSuccess() const { void ProcessGroupGloo::WorkGloo::synchronize() { if (cuda_) { auto thcState = ::at::globalContext().lazyInitCUDA(); - for (auto i = 0; i < devices_.size(); i++) { + for (size_t i = 0; i < devices_.size(); i++) { auto stream = THCState_getCurrentStreamOnDevice(thcState, devices_[i]); auto event = events_[i].getEvent(); C10D_CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); @@ -174,7 +176,7 @@ void ProcessGroupGloo::WorkGloo::finish(const AlgorithmEntry& entry) { CUDADevice deviceGuard; devices_ = entry.key.devices; events_.resize(devices_.size()); - for (auto i = 0; i < devices_.size(); i++) { + for (size_t i = 0; i < devices_.size(); i++) { deviceGuard.setDevice(devices_[i]); events_[i] = CUDAEvent::create(); const auto& event = events_[i].getEvent(); @@ -218,7 +220,7 
@@ ProcessGroupGloo::ProcessGroupGloo( } threads_.resize(options.threads); - for (int i = 0; i < threads_.size(); i++) { + for (size_t i = 0; i < threads_.size(); i++) { threads_[i] = std::thread(&ProcessGroupGloo::runLoop, this); } @@ -297,6 +299,8 @@ void ProcessGroupGloo::createAlgorithm(AlgorithmEntry& entry) { case CollectiveType::BROADCAST: GENERATE_ALL_TYPES(key.type->scalarType(), createBroadcast, entry); return; + case CollectiveType::UNUSED: + break; } throw std::runtime_error("Unhandled collective type"); @@ -387,7 +391,7 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { // Allocate source tensors for this entry auto& srcSizes = key.srcSizes; entry->src.resize(srcSizes.size()); - for (int i = 0; i < srcSizes.size(); i++) { + for (size_t i = 0; i < srcSizes.size(); i++) { deviceGuard.setDevice(key.type->is_cuda() ? key.devices[i] : -1); entry->src[i] = key.type->tensor(srcSizes[i]); } @@ -396,7 +400,7 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { if (key.type->is_cuda()) { entry->streams.resize(key.devices.size()); entry->events.resize(key.devices.size()); - for (auto i = 0; i < key.devices.size(); i++) { + for (size_t i = 0; i < key.devices.size(); i++) { deviceGuard.setDevice(key.devices[i]); entry->streams[i] = CUDAStream::create(); entry->events[i] = CUDAEvent::create(); @@ -466,7 +470,7 @@ std::shared_ptr ProcessGroupGloo::broadcast( synchronizeStreams(thcState_, entry); entry->run = [=]() mutable { entry->algorithm->run(); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { // The THCStreamGuard is a RAII wrapper for temporarily // overriding the current THCStream. This also sets the // current device to the stream's device. @@ -477,7 +481,7 @@ std::shared_ptr ProcessGroupGloo::broadcast( } else { entry->run = [=]() mutable { entry->algorithm->run(); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { tensors[i].copy_(entry->src[i]); } }; @@ -502,7 +506,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( auto entry = checkout(key); // Copy input tensors - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { entry->src[i].copy_(tensors[i]); } @@ -512,7 +516,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( synchronizeStreams(thcState_, entry); entry->run = [=]() mutable { entry->algorithm->run(); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { // The THCStreamGuard is a RAII wrapper for temporarily // overriding the current THCStream. This also sets the // current device to the stream's device. 
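The "auto i = 0" to "size_t i = 0" changes above and below (here, in Utils.hpp, and in the c10d tests) all fix the same pattern: auto i = 0 deduces a signed int, and comparing it against an unsigned .size() value trips -Wsign-compare, which GCC and Clang report once the -Wall -Wextra options added to the c10d CMakeLists above are in effect; that is presumably what motivated these mechanical edits. A minimal sketch of the pattern, using a made-up helper that is not part of c10d:

    #include <cstddef>
    #include <vector>

    // Hypothetical example, not c10d code: visit a vector of per-device sizes.
    void touchAll(const std::vector<int>& sizes) {
      // "for (auto i = 0; ...)" would deduce int and warn under -Wsign-compare
      // when compared with sizes.size(); size_t keeps the comparison unsigned.
      for (size_t i = 0; i < sizes.size(); i++) {
        (void)sizes[i];  // per-element work would go here
      }
    }
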
@@ -523,7 +527,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( } else { entry->run = [=]() mutable { entry->algorithm->run(); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { tensors[i].copy_(entry->src[i]); } }; diff --git a/torch/lib/c10d/Types.hpp b/torch/lib/c10d/Types.hpp index e589c6d621bd87..163e2312b55741 100644 --- a/torch/lib/c10d/Types.hpp +++ b/torch/lib/c10d/Types.hpp @@ -5,20 +5,8 @@ namespace c10d { enum class CollectiveType : std::uint8_t { - ALLGATHER = 0, - GATHER, - SCATTER, - ALLREDUCE, - REDUCE, BROADCAST, - SEND, - BARRIER, - UNUSED, -}; - -enum class DeviceType : std::uint8_t { - CPU = 0, - CUDA, + ALLREDUCE, UNUSED, }; diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index a079ff2cfe2598..321268c029a5d3 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -23,7 +23,7 @@ namespace c10d { inline std::string toString(at::IntList l) { std::stringstream ss; ss << "("; - for (int i = 0; i < l.size(); i++) { + for (size_t i = 0; i < l.size(); i++) { if (i > 0) { ss << ", "; } @@ -42,7 +42,7 @@ inline void assertSameSizeAndType(const std::vector& tensors) { // Ensure all tensors have identical type and shape auto& type = tensors[0].type(); auto sizes = tensors[0].sizes(); - for (auto i = 1; i < tensors.size(); i++) { + for (size_t i = 1; i < tensors.size(); i++) { if (tensors[i].type() != type) { const std::string expected = type.toString(); const std::string actual = tensors[i].type().toString(); @@ -63,7 +63,7 @@ inline void assertSameSizeAndType(const std::vector& tensors) { inline std::vector> getSizes( const std::vector& tensors) { std::vector> sizes(tensors.size()); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { sizes[i] = tensors[i].sizes(); } return sizes; @@ -72,7 +72,7 @@ inline std::vector> getSizes( inline std::vector getDevices(const std::vector& tensors) { std::vector devices(tensors.size(), -1); if (tensors[0].type().is_cuda()) { - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { devices[i] = tensors[i].storage()->getDevice(); } } @@ -82,7 +82,7 @@ inline std::vector getDevices(const std::vector& tensors) { template std::vector getDataPointers(const std::vector& tensors) { std::vector ptrs(tensors.size()); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { ptrs[i] = static_cast(tensors[i].storage()->data()); } return ptrs; diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index ab06631f02ff39..3138a7603bb03c 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -1,5 +1,4 @@ cuda_add_library(c10d_cuda_test CUDATest.cu) -target_compile_options(c10d_cuda_test PUBLIC "-std=c++11") target_link_libraries(c10d_cuda_test c10d) function(c10d_add_test test_src) diff --git a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp index 2e1bc8de07f1f6..acc10ad1e87bdf 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp @@ -186,25 +186,25 @@ class AsyncBroadcastTest : public AsyncInputIsOutputTest { void runAsyncAllreduceTest( const std::string& path, - int numProcesses, - int numTensors) { + size_t numProcesses, + size_t numTensors) { auto tests = initialize(path, numProcesses, numTensors); std::vector> work(numProcesses); - for (auto i = 0; i < numProcesses; i++) { + for 
(size_t i = 0; i < numProcesses; i++) { work[i] = tests[i].run(); } // Wait for work to complete - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { tests[i].wait(work[i]); } // Check results - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { const auto size = numProcesses * numTensors; const auto expected = (size * (size - 1)) / 2; auto tensors = tests[i].getTensors(); - for (auto j = 0; j < tensors.size(); j++) { + for (size_t j = 0; j < tensors.size(); j++) { auto& tensor = tensors[j]; auto data = tensor.data(); for (auto k = 0; k < tensor.numel(); k++) { @@ -218,28 +218,28 @@ void runAsyncAllreduceTest( void runAsyncBroadcastTest( const std::string& path, - int numProcesses, - int numTensors) { + size_t numProcesses, + size_t numTensors) { auto tests = initialize(path, numProcesses, numTensors); // Try every permutation of root rank and root tensor - for (auto rootRank = 0; rootRank < numProcesses; rootRank++) { - for (auto rootTensor = 0; rootTensor < numTensors; rootTensor++) { + for (size_t rootRank = 0; rootRank < numProcesses; rootRank++) { + for (size_t rootTensor = 0; rootTensor < numTensors; rootTensor++) { std::vector> work(numProcesses); - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { work[i] = tests[i].run(rootRank, rootTensor); } // Wait for work to complete - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { tests[i].wait(work[i]); } // Check results const auto expected = (rootRank * numTensors + rootTensor); - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { auto tensors = tests[i].getTensors(); - for (auto j = 0; j < tensors.size(); j++) { + for (size_t j = 0; j < tensors.size(); j++) { auto& tensor = tensors[j]; auto data = tensor.data(); for (auto k = 0; k < tensor.numel(); k++) { diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index 744f6c535fdab3..8aedc14dd9cd0a 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -139,10 +139,10 @@ class CollectiveTest { std::vector> copyTensors( const std::vector>& inputs) { std::vector> outputs(inputs.size()); - for (auto i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { const auto& input = inputs[i]; std::vector output(input.size()); - for (auto j = 0; j < input.size(); j++) { + for (size_t j = 0; j < input.size(); j++) { output[j] = input[j].toBackend(at::kCPU); } outputs[i] = std::move(output); diff --git a/torch/nn/parallel/parallel_apply.py b/torch/nn/parallel/parallel_apply.py index 281f72a716e2e4..6a5ab99cacfae1 100644 --- a/torch/nn/parallel/parallel_apply.py +++ b/torch/nn/parallel/parallel_apply.py @@ -18,6 +18,15 @@ def get_a_var(obj): def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): + r"""Applies each `module` in :attr:`modules` in parallel on arguments + contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword) + on each of :attr:`devices`. + + :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and + :attr:`devices` (if given) should all have same length. Moreover, each + element of :attr:`inputs` can either be a single object as the only argument + to a module, or a collection of positional arguments. 
+ """ assert len(modules) == len(inputs) if kwargs_tup is not None: assert len(modules) == len(kwargs_tup) @@ -38,6 +47,9 @@ def _worker(i, module, input, kwargs, device=None): device = get_a_var(input).get_device() try: with torch.cuda.device(device): + # this also avoids accidental slicing of `input` if it is a Tensor + if not isinstance(input, (list, tuple)): + input = (input,) output = module(*input, **kwargs) with lock: results[i] = output diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 89b05ae0d23060..e7cd77bb9a8731 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -600,12 +600,15 @@ def index_select(g, self, index, dim): def type_as(g, self, other): - if self.type().scalarType() == other.type().scalarType(): - # no-op + if self.isTensor() and other.isTensor() and self.type().scalarType() == other.type().scalarType(): return self + + if other.isTensor(): + other_type_name = other.type().scalarType() + return g.op("Cast", self, to_i=cast_pytorch_to_onnx[other_type_name]) else: - other_type_name = self.type().scalarType().lower() - return g.op("Cast", self, to_i=cast_pytorch_to_onnx[scalar_name_to_pytorch[other_type_name]]) + # We don't know the type of other, bail by emitting ATen + return g.op("ATen", self, other, operator_s="type_as") # ignore clone operators that are inserted by PyTorch autograd